/*
Copyright 2024.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package jobmanager

import (
	"context"
	"fmt"
	"strings"
	"time"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/utils/ptr"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/log"

	nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1"
)

const (
	// DefaultScannerImage is the default image used for scanner pods
	DefaultScannerImage = "ghcr.io/morten-olsen/nuclei-operator:latest"
	// DefaultTimeout is the default scan timeout
	DefaultTimeout = 30 * time.Minute
	// DefaultTTLAfterFinished is the default TTL for completed jobs
	DefaultTTLAfterFinished = 3600 // 1 hour
	// DefaultBackoffLimit is the default number of retries for failed jobs
	DefaultBackoffLimit = 2

	// LabelManagedBy is the label key for identifying managed resources
	LabelManagedBy = "app.kubernetes.io/managed-by"
	// LabelComponent is the label key for component identification
	LabelComponent = "app.kubernetes.io/component"
	// LabelScanName is the label key for the scan name
	LabelScanName = "nuclei.homelab.mortenolsen.pro/scan-name"
	// LabelScanNamespace is the label key for the scan namespace
	LabelScanNamespace = "nuclei.homelab.mortenolsen.pro/scan-namespace"
)

// Config holds the configuration for the JobManager
type Config struct {
	// ScannerImage is the default image to use for scanner pods
	ScannerImage string
	// DefaultTimeout is the default scan timeout
	DefaultTimeout time.Duration
	// TTLAfterFinished is the TTL for completed jobs in seconds
	TTLAfterFinished int32
	// BackoffLimit is the number of retries for failed jobs
	BackoffLimit int32
	// MaxConcurrent is the maximum number of concurrent scan jobs
	MaxConcurrent int
	// ServiceAccountName is the service account to use for scanner pods
	ServiceAccountName string
	// DefaultResources are the default resource requirements for scanner pods
	DefaultResources corev1.ResourceRequirements
	// DefaultTemplates are the default templates to use for scans
	DefaultTemplates []string
	// DefaultSeverity is the default severity filter
	DefaultSeverity []string
}

// DefaultConfig returns a Config with default values
func DefaultConfig() Config {
	return Config{
		ScannerImage:       DefaultScannerImage,
		DefaultTimeout:     DefaultTimeout,
		TTLAfterFinished:   DefaultTTLAfterFinished,
		BackoffLimit:       DefaultBackoffLimit,
		MaxConcurrent:      5,
		ServiceAccountName: "nuclei-scanner",
		DefaultResources: corev1.ResourceRequirements{
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("100m"),
				corev1.ResourceMemory: resource.MustParse("256Mi"),
			},
			Limits: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("1"),
				corev1.ResourceMemory: resource.MustParse("1Gi"),
			},
		},
	}
}
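// Wiring sketch (hedged, not part of this package's API): one way a caller
// could construct the manager from a controller-runtime manager "mgr", with
// an example config override. The variable names here are assumptions:
//
//	cfg := DefaultConfig()
//	cfg.MaxConcurrent = 10 // example override
//	jm := NewJobManager(mgr.GetClient(), mgr.GetScheme(), cfg)
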
// JobManager manages scanner jobs for NucleiScan resources
type JobManager struct {
	client.Client
	Scheme *runtime.Scheme
	Config Config
}

// NewJobManager creates a new JobManager with the given configuration
func NewJobManager(c client.Client, scheme *runtime.Scheme, config Config) *JobManager {
	return &JobManager{
		Client: c,
		Scheme: scheme,
		Config: config,
	}
}

// CreateScanJob creates a new scanner job for the given NucleiScan
func (m *JobManager) CreateScanJob(ctx context.Context, scan *nucleiv1alpha1.NucleiScan) (*batchv1.Job, error) {
	logger := log.FromContext(ctx)

	job := m.buildJob(scan)

	// Set owner reference so the job is garbage collected when the scan is deleted
	if err := controllerutil.SetControllerReference(scan, job, m.Scheme); err != nil {
		return nil, fmt.Errorf("failed to set controller reference: %w", err)
	}

	logger.Info("Creating scanner job",
		"job", job.Name,
		"namespace", job.Namespace,
		"image", job.Spec.Template.Spec.Containers[0].Image,
		"targets", len(scan.Spec.Targets))

	if err := m.Create(ctx, job); err != nil {
		return nil, fmt.Errorf("failed to create job: %w", err)
	}

	return job, nil
}

// GetJob retrieves a job by name and namespace
func (m *JobManager) GetJob(ctx context.Context, name, namespace string) (*batchv1.Job, error) {
	job := &batchv1.Job{}
	err := m.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, job)
	if err != nil {
		return nil, err
	}
	return job, nil
}

// DeleteJob deletes a job by name and namespace
func (m *JobManager) DeleteJob(ctx context.Context, name, namespace string) error {
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: namespace,
		},
	}
	return m.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground))
}

// GetJobPodName returns the name of the pod created by the job, or an empty
// string if no pod has been scheduled yet
func (m *JobManager) GetJobPodName(ctx context.Context, job *batchv1.Job) (string, error) {
	podList := &corev1.PodList{}
	err := m.List(ctx, podList,
		client.InNamespace(job.Namespace),
		client.MatchingLabels{"job-name": job.Name})
	if err != nil {
		return "", err
	}
	if len(podList.Items) == 0 {
		return "", nil
	}
	// Return the first pod (there should only be one for our jobs)
	return podList.Items[0].Name, nil
}

// IsJobComplete returns true if the job has completed (successfully or failed)
func (m *JobManager) IsJobComplete(job *batchv1.Job) bool {
	for _, condition := range job.Status.Conditions {
		if (condition.Type == batchv1.JobComplete || condition.Type == batchv1.JobFailed) &&
			condition.Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}

// IsJobSuccessful returns true if the job completed successfully
func (m *JobManager) IsJobSuccessful(job *batchv1.Job) bool {
	for _, condition := range job.Status.Conditions {
		if condition.Type == batchv1.JobComplete && condition.Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}

// IsJobFailed returns true if the job failed
func (m *JobManager) IsJobFailed(job *batchv1.Job) bool {
	for _, condition := range job.Status.Conditions {
		if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}

// GetJobFailureReason returns the reason for job failure
func (m *JobManager) GetJobFailureReason(job *batchv1.Job) string {
	for _, condition := range job.Status.Conditions {
		if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue {
			return condition.Message
		}
	}
	return "Unknown failure reason"
}
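// Status-handling sketch (hedged): a typical reconcile step once a job has
// been created. "scan.Status.JobName" is an assumed field used purely for
// illustration, and "ctrl" is the conventional controller-runtime alias:
//
//	job, err := jm.GetJob(ctx, scan.Status.JobName, scan.Namespace)
//	if err != nil {
//		return ctrl.Result{}, client.IgnoreNotFound(err)
//	}
//	switch {
//	case jm.IsJobSuccessful(job):
//		// collect results and mark the scan complete
//	case jm.IsJobFailed(job):
//		// record jm.GetJobFailureReason(job) on the scan status
//	default:
//		// still running; requeue and check again later
//	}
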
// CountActiveJobs returns the number of currently active scan jobs
func (m *JobManager) CountActiveJobs(ctx context.Context) (int, error) {
	jobList := &batchv1.JobList{}
	err := m.List(ctx, jobList,
		client.MatchingLabels{
			LabelManagedBy: "nuclei-operator",
			LabelComponent: "scanner",
		})
	if err != nil {
		return 0, err
	}

	count := 0
	for _, job := range jobList.Items {
		if job.Status.Active > 0 {
			count++
		}
	}
	return count, nil
}

// AtCapacity returns true if the maximum number of concurrent jobs has been reached
func (m *JobManager) AtCapacity(ctx context.Context) (bool, error) {
	count, err := m.CountActiveJobs(ctx)
	if err != nil {
		return false, err
	}
	return count >= m.Config.MaxConcurrent, nil
}

// CleanupOrphanedJobs deletes scanner jobs that have lost their controller
// owner, along with jobs that appear stuck (active for longer than twice the
// configured timeout). Jobs that still have an owner are left alone; Kubernetes
// garbage collection removes them when their NucleiScan is deleted.
func (m *JobManager) CleanupOrphanedJobs(ctx context.Context) error {
	logger := log.FromContext(ctx)

	jobList := &batchv1.JobList{}
	err := m.List(ctx, jobList, client.MatchingLabels{
		LabelManagedBy: "nuclei-operator",
		LabelComponent: "scanner",
	})
	if err != nil {
		return err
	}

	for _, job := range jobList.Items {
		// Jobs created by CreateScanJob always carry a controller reference,
		// so a missing one means the job is orphaned
		ownerRef := metav1.GetControllerOf(&job)
		if ownerRef == nil {
			logger.Info("Deleting orphaned job without owner",
				"job", job.Name, "namespace", job.Namespace)
			if err := m.DeleteJob(ctx, job.Name, job.Namespace); err != nil && !apierrors.IsNotFound(err) {
				logger.Error(err, "Failed to delete orphaned job", "job", job.Name)
			}
			continue
		}

		// Check if the job is stuck (running longer than 2x the timeout)
		if job.Status.StartTime != nil {
			maxDuration := 2 * m.Config.DefaultTimeout
			if time.Since(job.Status.StartTime.Time) > maxDuration && job.Status.Active > 0 {
				logger.Info("Deleting stuck job",
					"job", job.Name,
					"namespace", job.Namespace,
					"age", time.Since(job.Status.StartTime.Time))
				if err := m.DeleteJob(ctx, job.Name, job.Namespace); err != nil && !apierrors.IsNotFound(err) {
					logger.Error(err, "Failed to delete stuck job", "job", job.Name)
				}
			}
		}
	}

	return nil
}

// buildJob creates a Job specification for the given NucleiScan
func (m *JobManager) buildJob(scan *nucleiv1alpha1.NucleiScan) *batchv1.Job {
	// Generate a unique job name, capped at 63 characters so it remains
	// usable as a label value (the job controller stamps it onto pods as
	// the job-name label)
	jobName := fmt.Sprintf("nucleiscan-%s-%d", scan.Name, time.Now().Unix())
	if len(jobName) > 63 {
		jobName = jobName[:63]
	}
	// Truncation can leave a trailing '-', which is not a valid ending for
	// a DNS-1123 label, so trim it off
	jobName = strings.TrimRight(jobName, "-")

	// Determine the scanner image
	image := m.Config.ScannerImage
	if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Image != "" {
		image = scan.Spec.ScannerConfig.Image
	}

	// Determine timeout
	timeout := m.Config.DefaultTimeout
	if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Timeout != nil {
		timeout = scan.Spec.ScannerConfig.Timeout.Duration
	}
	activeDeadlineSeconds := int64(timeout.Seconds())

	// Determine resources
	resources := m.Config.DefaultResources
	if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Resources != nil {
		resources = *scan.Spec.ScannerConfig.Resources
	}

	// Build command arguments for scanner mode
	args := []string{
		"--mode=scanner",
		fmt.Sprintf("--scan-name=%s", scan.Name),
		fmt.Sprintf("--scan-namespace=%s", scan.Namespace),
	}

	// Build labels
	labels := map[string]string{
		LabelManagedBy:     "nuclei-operator",
		LabelComponent:     "scanner",
		LabelScanName:      scan.Name,
		LabelScanNamespace: scan.Namespace,
	}

	// Build node selector
	var nodeSelector map[string]string
	if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.NodeSelector != nil {
		nodeSelector = scan.Spec.ScannerConfig.NodeSelector
	}

	// Build tolerations
	var tolerations []corev1.Toleration
	if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Tolerations != nil {
		tolerations = scan.Spec.ScannerConfig.Tolerations
	}
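	// Assemble the Job: a single scanner container running as a non-root
	// user with a restricted security context. The pod is never restarted
	// in place; retries are governed by the Job's backoff limit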
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      jobName,
			Namespace: scan.Namespace,
			Labels:    labels,
		},
		Spec: batchv1.JobSpec{
			TTLSecondsAfterFinished: ptr.To(m.Config.TTLAfterFinished),
			BackoffLimit:            ptr.To(m.Config.BackoffLimit),
			ActiveDeadlineSeconds:   &activeDeadlineSeconds,
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: labels,
				},
				Spec: corev1.PodSpec{
					RestartPolicy:      corev1.RestartPolicyNever,
					ServiceAccountName: m.Config.ServiceAccountName,
					NodeSelector:       nodeSelector,
					Tolerations:        tolerations,
					SecurityContext: &corev1.PodSecurityContext{
						RunAsNonRoot: ptr.To(true),
						RunAsUser:    ptr.To(int64(65532)),
						RunAsGroup:   ptr.To(int64(65532)),
						FSGroup:      ptr.To(int64(65532)),
						SeccompProfile: &corev1.SeccompProfile{
							Type: corev1.SeccompProfileTypeRuntimeDefault,
						},
					},
					Containers: []corev1.Container{
						{
							Name:      "scanner",
							Image:     image,
							Args:      args,
							Resources: resources,
							SecurityContext: &corev1.SecurityContext{
								AllowPrivilegeEscalation: ptr.To(false),
								ReadOnlyRootFilesystem:   ptr.To(false), // Nuclei needs temp files
								Capabilities: &corev1.Capabilities{
									Drop: []corev1.Capability{"ALL"},
								},
							},
							Env: []corev1.EnvVar{
								{
									Name: "POD_NAME",
									ValueFrom: &corev1.EnvVarSource{
										FieldRef: &corev1.ObjectFieldSelector{
											FieldPath: "metadata.name",
										},
									},
								},
								{
									Name: "POD_NAMESPACE",
									ValueFrom: &corev1.EnvVarSource{
										FieldRef: &corev1.ObjectFieldSelector{
											FieldPath: "metadata.namespace",
										},
									},
								},
							},
						},
					},
				},
			},
		},
	}

	return job
}
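
// Capacity-gating sketch (hedged): how a reconciler might combine AtCapacity
// with CreateScanJob to throttle cluster-wide scan concurrency. The requeue
// interval and the "ctrl" alias are assumptions:
//
//	full, err := jm.AtCapacity(ctx)
//	if err != nil {
//		return ctrl.Result{}, err
//	}
//	if full {
//		// Back off and retry once a slot frees up
//		return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
//	}
//	job, err := jm.CreateScanJob(ctx, scan)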