feat: implement pod-based scanning architecture

This major refactor moves from synchronous subprocess-based scanning to
asynchronous pod-based scanning using Kubernetes Jobs.

## Architecture Changes
- Scanner jobs are now Kubernetes Jobs with TTLAfterFinished for automatic cleanup
- Jobs have owner references for garbage collection when NucleiScan is deleted
- Configurable concurrency limits, timeouts, and resource requirements
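
The controller-side flow is roughly the following. This is a condensed sketch, not the actual reconciler: the JobManager calls (AtCapacity, CreateScanJob) and the JobRef/ScanStartTime status fields come from this commit, while the surrounding reconciler type, the JobReference field set, and the requeue interval are illustrative assumptions.

```go
package controller

import (
	"context"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1"
	"github.com/mortenolsen/nuclei-operator/internal/jobmanager"
)

// scanStarter sketches how the reconciler hands a scan off to the Job Manager.
// Only the jobmanager calls and the status field names are taken from this commit.
type scanStarter struct {
	client.Client
	Jobs *jobmanager.JobManager
}

func (r *scanStarter) startScan(ctx context.Context, scan *nucleiv1alpha1.NucleiScan) (ctrl.Result, error) {
	// Respect the operator-wide concurrency limit before creating another scanner job.
	full, err := r.Jobs.AtCapacity(ctx)
	if err != nil {
		return ctrl.Result{}, err
	}
	if full {
		return ctrl.Result{RequeueAfter: 30 * time.Second}, nil // retry once a slot frees up
	}

	// The job carries an owner reference to the NucleiScan and a TTL, so it is
	// garbage collected on scan deletion and cleaned up after completion.
	job, err := r.Jobs.CreateScanJob(ctx, scan)
	if err != nil {
		return ctrl.Result{}, err
	}

	// Record the job so the scan can be correlated with it and recovered after an
	// operator restart. The JobReference field set shown here is an assumption.
	scan.Status.JobRef = &nucleiv1alpha1.JobReference{Name: job.Name, Namespace: job.Namespace}
	scan.Status.ScanStartTime = &metav1.Time{Time: time.Now()}
	return ctrl.Result{}, r.Status().Update(ctx, scan)
}
```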

## New Features
- Dual-mode binary: --mode=controller (default) or --mode=scanner (see the dispatch sketch after this list)
- Annotation-based configuration for Ingress/VirtualService resources
- Operator-level configuration via environment variables
- Startup recovery for orphaned scans after operator restart
- Periodic cleanup of stuck jobs
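
A minimal sketch of the dual-mode dispatch referenced above; the --mode, --scan-name, and --scan-namespace flag names come from this commit, while the run functions and flag descriptions are placeholders:

```go
package main

import (
	"flag"
	"log"
)

// runController and runScanner stand in for the real controller setup and the
// scanner runner in internal/scanner/runner.go.
func runController()                    { /* manager setup elided */ }
func runScanner(name, namespace string) { /* scanner runner elided */ }

func main() {
	mode := flag.String("mode", "controller", "run mode: controller or scanner")
	scanName := flag.String("scan-name", "", "NucleiScan to execute (scanner mode)")
	scanNamespace := flag.String("scan-namespace", "", "namespace of the NucleiScan (scanner mode)")
	flag.Parse()

	switch *mode {
	case "controller":
		runController()
	case "scanner":
		runScanner(*scanName, *scanNamespace)
	default:
		log.Fatalf("unknown mode %q", *mode)
	}
}
```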

## New Files
- DESIGN.md: Comprehensive architecture design document
- internal/jobmanager/: Job Manager for creating/monitoring scanner jobs
- internal/scanner/runner.go: Scanner mode implementation
- internal/annotations/: Annotation parsing utilities
- charts/nuclei-operator/templates/scanner-rbac.yaml: Scanner RBAC

## API Changes
- Added ScannerConfig struct for per-scan scanner configuration
- Added JobReference struct for tracking scanner jobs
- Added ScannerConfig field to NucleiScanSpec
- Added JobRef and ScanStartTime fields to NucleiScanStatus
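
Roughly, the new types look like this. The ScannerConfig field names match how internal/jobmanager consumes them; the JobReference field set, JSON tags, and the status field types are assumptions, so consult api/v1alpha1 for the authoritative definitions:

```go
package v1alpha1

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// ScannerConfig holds per-scan overrides for the scanner job.
type ScannerConfig struct {
	Image        string                       `json:"image,omitempty"`
	Timeout      *metav1.Duration             `json:"timeout,omitempty"`
	Resources    *corev1.ResourceRequirements `json:"resources,omitempty"`
	NodeSelector map[string]string            `json:"nodeSelector,omitempty"`
	Tolerations  []corev1.Toleration          `json:"tolerations,omitempty"`
}

// JobReference identifies the scanner job created for a scan
// (field set is an assumption).
type JobReference struct {
	Name      string `json:"name"`
	Namespace string `json:"namespace"`
}

// NucleiScanSpec gains:   ScannerConfig *ScannerConfig
// NucleiScanStatus gains: JobRef *JobReference and ScanStartTime *metav1.Time
```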

## Supported Annotations
- nuclei.homelab.mortenolsen.pro/enabled
- nuclei.homelab.mortenolsen.pro/templates
- nuclei.homelab.mortenolsen.pro/severity
- nuclei.homelab.mortenolsen.pro/schedule
- nuclei.homelab.mortenolsen.pro/timeout
- nuclei.homelab.mortenolsen.pro/scanner-image
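
For example, the keys are set directly under the Ingress or VirtualService metadata.annotations; the values below are illustrative, not shipped defaults:

```go
// exampleAnnotations shows the supported keys as they would appear under
// metadata.annotations; every value here is an illustrative assumption.
var exampleAnnotations = map[string]string{
	"nuclei.homelab.mortenolsen.pro/enabled":       "true",
	"nuclei.homelab.mortenolsen.pro/templates":     "http/cves,http/exposures",
	"nuclei.homelab.mortenolsen.pro/severity":      "high,critical",
	"nuclei.homelab.mortenolsen.pro/schedule":      "0 2 * * *",
	"nuclei.homelab.mortenolsen.pro/timeout":       "30m",
	"nuclei.homelab.mortenolsen.pro/scanner-image": "ghcr.io/morten-olsen/nuclei-operator:latest",
}
```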

## RBAC Updates
- Added Job and Pod permissions for operator
- Created separate scanner service account with minimal permissions
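
Expressed with rbacv1 types, the operator's additional rules are roughly the following; the verb lists are assumptions and the Helm chart templates (including scanner-rbac.yaml) remain authoritative:

```go
package rbac

import rbacv1 "k8s.io/api/rbac/v1"

// operatorScanRules sketches the extra operator permissions for managing
// scanner jobs and their pods. Verb lists are assumptions.
var operatorScanRules = []rbacv1.PolicyRule{
	{
		APIGroups: []string{"batch"},
		Resources: []string{"jobs"},
		Verbs:     []string{"create", "get", "list", "watch", "delete"},
	},
	{
		APIGroups: []string{""},
		Resources: []string{"pods"},
		Verbs:     []string{"get", "list", "watch"},
	},
}
```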

## Documentation
- Updated README, user-guide, api.md, and Helm chart README
- Added example annotated Ingress resources

Author: Morten Olsen
Date: 2025-12-12 20:51:23 +01:00
Parent: 519ed32de3
Commit: 12d681ada1
22 changed files with 3060 additions and 245 deletions


@@ -0,0 +1,427 @@
/*
Copyright 2024.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package jobmanager
import (
"context"
"fmt"
"time"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1"
)
const (
// DefaultScannerImage is the default image used for scanner pods
DefaultScannerImage = "ghcr.io/morten-olsen/nuclei-operator:latest"
// DefaultTimeout is the default scan timeout
DefaultTimeout = 30 * time.Minute
// DefaultTTLAfterFinished is the default TTL for completed jobs
DefaultTTLAfterFinished = 3600 // 1 hour
// DefaultBackoffLimit is the default number of retries for failed jobs
DefaultBackoffLimit = 2
// LabelManagedBy is the label key for identifying managed resources
LabelManagedBy = "app.kubernetes.io/managed-by"
// LabelComponent is the label key for component identification
LabelComponent = "app.kubernetes.io/component"
// LabelScanName is the label key for the scan name
LabelScanName = "nuclei.homelab.mortenolsen.pro/scan-name"
// LabelScanNamespace is the label key for the scan namespace
LabelScanNamespace = "nuclei.homelab.mortenolsen.pro/scan-namespace"
)
// Config holds the configuration for the JobManager
type Config struct {
// ScannerImage is the default image to use for scanner pods
ScannerImage string
// DefaultTimeout is the default scan timeout
DefaultTimeout time.Duration
// TTLAfterFinished is the TTL for completed jobs in seconds
TTLAfterFinished int32
// BackoffLimit is the number of retries for failed jobs
BackoffLimit int32
// MaxConcurrent is the maximum number of concurrent scan jobs
MaxConcurrent int
// ServiceAccountName is the service account to use for scanner pods
ServiceAccountName string
// DefaultResources are the default resource requirements for scanner pods
DefaultResources corev1.ResourceRequirements
// DefaultTemplates are the default templates to use for scans
DefaultTemplates []string
// DefaultSeverity is the default severity filter
DefaultSeverity []string
}
// DefaultConfig returns a Config with default values
func DefaultConfig() Config {
return Config{
ScannerImage: DefaultScannerImage,
DefaultTimeout: DefaultTimeout,
TTLAfterFinished: DefaultTTLAfterFinished,
BackoffLimit: DefaultBackoffLimit,
MaxConcurrent: 5,
ServiceAccountName: "nuclei-scanner",
DefaultResources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("100m"),
corev1.ResourceMemory: resource.MustParse("256Mi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
corev1.ResourceMemory: resource.MustParse("1Gi"),
},
},
}
}
// JobManager manages scanner jobs for NucleiScan resources
type JobManager struct {
client.Client
Scheme *runtime.Scheme
Config Config
}
// NewJobManager creates a new JobManager with the given configuration
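//
// Typical wiring (illustrative; the real setup lives in the controller and main packages):
//
//	jm := jobmanager.NewJobManager(mgr.GetClient(), mgr.GetScheme(), jobmanager.DefaultConfig())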
func NewJobManager(c client.Client, scheme *runtime.Scheme, config Config) *JobManager {
return &JobManager{
Client: c,
Scheme: scheme,
Config: config,
}
}
// CreateScanJob creates a new scanner job for the given NucleiScan
func (m *JobManager) CreateScanJob(ctx context.Context, scan *nucleiv1alpha1.NucleiScan) (*batchv1.Job, error) {
logger := log.FromContext(ctx)
job := m.buildJob(scan)
// Set owner reference so the job is garbage collected when the scan is deleted
if err := controllerutil.SetControllerReference(scan, job, m.Scheme); err != nil {
return nil, fmt.Errorf("failed to set controller reference: %w", err)
}
logger.Info("Creating scanner job",
"job", job.Name,
"namespace", job.Namespace,
"image", job.Spec.Template.Spec.Containers[0].Image,
"targets", len(scan.Spec.Targets))
if err := m.Create(ctx, job); err != nil {
return nil, fmt.Errorf("failed to create job: %w", err)
}
return job, nil
}
// GetJob retrieves a job by name and namespace
func (m *JobManager) GetJob(ctx context.Context, name, namespace string) (*batchv1.Job, error) {
job := &batchv1.Job{}
err := m.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, job)
if err != nil {
return nil, err
}
return job, nil
}
// DeleteJob deletes a job by name and namespace
func (m *JobManager) DeleteJob(ctx context.Context, name, namespace string) error {
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: namespace,
},
}
return m.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground))
}
// GetJobPodName returns the name of the pod created by the job
func (m *JobManager) GetJobPodName(ctx context.Context, job *batchv1.Job) (string, error) {
podList := &corev1.PodList{}
err := m.List(ctx, podList,
client.InNamespace(job.Namespace),
client.MatchingLabels{"job-name": job.Name})
if err != nil {
return "", err
}
if len(podList.Items) == 0 {
return "", nil
}
// Return the first pod (there should only be one for our jobs)
return podList.Items[0].Name, nil
}
// IsJobComplete returns true if the job has completed (successfully or failed)
func (m *JobManager) IsJobComplete(job *batchv1.Job) bool {
for _, condition := range job.Status.Conditions {
if (condition.Type == batchv1.JobComplete || condition.Type == batchv1.JobFailed) &&
condition.Status == corev1.ConditionTrue {
return true
}
}
return false
}
// IsJobSuccessful returns true if the job completed successfully
func (m *JobManager) IsJobSuccessful(job *batchv1.Job) bool {
for _, condition := range job.Status.Conditions {
if condition.Type == batchv1.JobComplete && condition.Status == corev1.ConditionTrue {
return true
}
}
return false
}
// IsJobFailed returns true if the job failed
func (m *JobManager) IsJobFailed(job *batchv1.Job) bool {
for _, condition := range job.Status.Conditions {
if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
return true
}
}
return false
}
// GetJobFailureReason returns the reason for job failure
func (m *JobManager) GetJobFailureReason(job *batchv1.Job) string {
for _, condition := range job.Status.Conditions {
if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
return condition.Message
}
}
return "Unknown failure reason"
}
// CountActiveJobs returns the number of currently active scan jobs
func (m *JobManager) CountActiveJobs(ctx context.Context) (int, error) {
jobList := &batchv1.JobList{}
err := m.List(ctx, jobList, client.MatchingLabels{
LabelManagedBy: "nuclei-operator",
LabelComponent: "scanner",
})
if err != nil {
return 0, err
}
count := 0
for _, job := range jobList.Items {
if job.Status.Active > 0 {
count++
}
}
return count, nil
}
// AtCapacity returns true if the maximum number of concurrent jobs has been reached
func (m *JobManager) AtCapacity(ctx context.Context) (bool, error) {
count, err := m.CountActiveJobs(ctx)
if err != nil {
return false, err
}
return count >= m.Config.MaxConcurrent, nil
}
// CleanupOrphanedJobs removes scanner jobs that have no controller owner reference,
// as well as jobs that have been active far longer than the configured timeout
func (m *JobManager) CleanupOrphanedJobs(ctx context.Context) error {
logger := log.FromContext(ctx)
jobList := &batchv1.JobList{}
err := m.List(ctx, jobList, client.MatchingLabels{
LabelManagedBy: "nuclei-operator",
LabelComponent: "scanner",
})
if err != nil {
return err
}
for _, job := range jobList.Items {
// Jobs created by the operator always carry a controller reference; if it is missing,
// treat the job as orphaned. Jobs whose owning NucleiScan was deleted are removed by
// Kubernetes garbage collection and need no handling here.
ownerRef := metav1.GetControllerOf(&job)
if ownerRef == nil {
logger.Info("Deleting orphaned job without owner", "job", job.Name, "namespace", job.Namespace)
if err := m.DeleteJob(ctx, job.Name, job.Namespace); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Failed to delete orphaned job", "job", job.Name)
}
continue
}
// Check if the job is stuck (active for more than 2x the operator's default timeout);
// the job's own activeDeadlineSeconds should normally terminate it first
if job.Status.StartTime != nil {
maxDuration := 2 * m.Config.DefaultTimeout
if time.Since(job.Status.StartTime.Time) > maxDuration && job.Status.Active > 0 {
logger.Info("Deleting stuck job", "job", job.Name, "namespace", job.Namespace,
"age", time.Since(job.Status.StartTime.Time))
if err := m.DeleteJob(ctx, job.Name, job.Namespace); err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Failed to delete stuck job", "job", job.Name)
}
}
}
}
return nil
}
// buildJob creates a Job specification for the given NucleiScan
func (m *JobManager) buildJob(scan *nucleiv1alpha1.NucleiScan) *batchv1.Job {
// Generate a unique job name; truncate the scan name rather than the timestamp
// suffix so long scan names cannot collide or leave an invalid trailing character
suffix := fmt.Sprintf("-%d", time.Now().Unix())
base := fmt.Sprintf("nucleiscan-%s", scan.Name)
if len(base) > 63-len(suffix) {
base = base[:63-len(suffix)]
}
jobName := base + suffix
// Determine the scanner image
image := m.Config.ScannerImage
if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Image != "" {
image = scan.Spec.ScannerConfig.Image
}
// Determine timeout
timeout := m.Config.DefaultTimeout
if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Timeout != nil {
timeout = scan.Spec.ScannerConfig.Timeout.Duration
}
activeDeadlineSeconds := int64(timeout.Seconds())
// Determine resources
resources := m.Config.DefaultResources
if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Resources != nil {
resources = *scan.Spec.ScannerConfig.Resources
}
// Build command arguments for scanner mode
args := []string{
"--mode=scanner",
fmt.Sprintf("--scan-name=%s", scan.Name),
fmt.Sprintf("--scan-namespace=%s", scan.Namespace),
}
// Build labels
labels := map[string]string{
LabelManagedBy: "nuclei-operator",
LabelComponent: "scanner",
LabelScanName: scan.Name,
LabelScanNamespace: scan.Namespace,
}
// Build node selector
var nodeSelector map[string]string
if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.NodeSelector != nil {
nodeSelector = scan.Spec.ScannerConfig.NodeSelector
}
// Build tolerations
var tolerations []corev1.Toleration
if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Tolerations != nil {
tolerations = scan.Spec.ScannerConfig.Tolerations
}
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: jobName,
Namespace: scan.Namespace,
Labels: labels,
},
Spec: batchv1.JobSpec{
TTLSecondsAfterFinished: ptr.To(m.Config.TTLAfterFinished),
BackoffLimit: ptr.To(m.Config.BackoffLimit),
ActiveDeadlineSeconds: &activeDeadlineSeconds,
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: labels,
},
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyNever,
ServiceAccountName: m.Config.ServiceAccountName,
NodeSelector: nodeSelector,
Tolerations: tolerations,
SecurityContext: &corev1.PodSecurityContext{
RunAsNonRoot: ptr.To(true),
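// 65532 is the conventional distroless "nonroot" UID and GID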
RunAsUser: ptr.To(int64(65532)),
RunAsGroup: ptr.To(int64(65532)),
FSGroup: ptr.To(int64(65532)),
SeccompProfile: &corev1.SeccompProfile{
Type: corev1.SeccompProfileTypeRuntimeDefault,
},
},
Containers: []corev1.Container{
{
Name: "scanner",
Image: image,
Args: args,
Resources: resources,
SecurityContext: &corev1.SecurityContext{
AllowPrivilegeEscalation: ptr.To(false),
ReadOnlyRootFilesystem: ptr.To(false), // Nuclei needs temp files
Capabilities: &corev1.Capabilities{
Drop: []corev1.Capability{"ALL"},
},
},
Env: []corev1.EnvVar{
{
Name: "POD_NAME",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.name",
},
},
},
{
Name: "POD_NAMESPACE",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{
FieldPath: "metadata.namespace",
},
},
},
},
},
},
},
},
},
}
return job
}