updates

docs
2026-02-08 01:36:28 +01:00 · 2025-09-05 11:22:58 +02:00 · 2025-09-05 08:56:04 +02:00
12 changed files with 969 additions and 12 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -0,0 +1,3 @@
+TODO:
+* Set location provisioner path permissions
+* Limit postgres connections in reconciler
--- a/charts/monitoring/Chart.yaml
+++ b/charts/monitoring/Chart.yaml
@@ -0,0 +1,3 @@
+apiVersion: v2
+version: 1.0.0
+name: monitoring
--- a/charts/monitoring/templates/falco.yaml
+++ b/charts/monitoring/templates/falco.yaml
@@ -0,0 +1,25 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository # chart source referenced by the HelmRelease below
+metadata:
+  name: '{{ .Release.Name }}-falco'
+spec:
+  interval: 1h
+  url: https://falcosecurity.github.io/charts
+
+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease # releases the `falco` chart from the repository declared above
+metadata:
+  name: '{{ .Release.Name }}-falco'
+spec:
+  chart:
+    spec:
+      chart: falco
+      reconcileStrategy: ChartVersion
+      sourceRef:
+        apiVersion: source.toolkit.fluxcd.io/v1
+        kind: HelmRepository
+        name: '{{ .Release.Name }}-falco' # must match the HelmRepository metadata.name above
+        namespace: '{{ .Release.Namespace }}'
+  interval: 1h
+  values: {} # no value overrides are set here
--- a/charts/monitoring/templates/kube-prometheus-stack.yaml
+++ b/charts/monitoring/templates/kube-prometheus-stack.yaml
@@ -0,0 +1,51 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository # chart source referenced by the HelmRelease below
+metadata:
+  name: '{{ .Release.Name }}-prometheus-community'
+spec:
+  interval: 1h
+  url: https://prometheus-community.github.io/helm-charts/
+
+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease # releases the `kube-prometheus-stack` chart from the repository above
+metadata:
+  name: '{{ .Release.Name }}-prometheus-community'
+spec:
+  chart:
+    spec:
+      chart: kube-prometheus-stack
+      reconcileStrategy: ChartVersion
+      sourceRef:
+        apiVersion: source.toolkit.fluxcd.io/v1
+        kind: HelmRepository
+        name: '{{ .Release.Name }}-prometheus-community' # must match the HelmRepository metadata.name above
+        namespace: '{{ .Release.Namespace }}'
+  interval: 1h
+  values: {} # no value overrides are set here
+
+---
+apiVersion: homelab.mortenolsen.pro/v1
+kind: HttpService # routes the Grafana subdomain to the Grafana service
+metadata:
+  name: '{{ .Release.Name }}-prometheus-community'
+spec:
+  environment: '{{ .Values.globals.environment }}'
+  subdomain: '{{ .Values.graphana.subdomain }}' # values key is spelled "graphana" in values.yaml
+  destination:
+    host: '{{ .Release.Name }}-prometheus-community-grafana.{{ .Release.Namespace }}.svc.cluster.local'
+    port:
+      number: 80
+
+---
+apiVersion: homelab.mortenolsen.pro/v1
+kind: HttpService # routes a separate subdomain to Alertmanager
+metadata:
+  name: '{{ .Release.Name }}-prometheus-community-alertmanager'
+spec:
+  environment: '{{ .Values.globals.environment }}'
+  subdomain: '{{ (.Values.alertmanager | default dict).subdomain | default "alertmanager" }}' # own subdomain so it does not collide with Grafana; override with .Values.alertmanager.subdomain
+  destination:
+    host: '{{ .Release.Name }}-prometheus-community-alertmanager.{{ .Release.Namespace }}.svc.cluster.local' # NOTE(review): confirm this matches the Service name the chart actually creates
+    port:
+      number: 9093
--- a/charts/monitoring/templates/kyverno.yaml
+++ b/charts/monitoring/templates/kyverno.yaml
@@ -0,0 +1,25 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository # chart source referenced by the HelmRelease below
+metadata:
+  name: '{{ .Release.Name }}-kyverno'
+spec:
+  interval: 1h
+  url: https://kyverno.github.io/kyverno/
+
+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease # releases the `kyverno` chart from the repository declared above
+metadata:
+  name: '{{ .Release.Name }}-kyverno'
+spec:
+  chart:
+    spec:
+      chart: kyverno
+      reconcileStrategy: ChartVersion
+      sourceRef:
+        apiVersion: source.toolkit.fluxcd.io/v1
+        kind: HelmRepository
+        name: '{{ .Release.Name }}-kyverno' # must match the HelmRepository metadata.name above
+        namespace: '{{ .Release.Namespace }}'
+  interval: 1h
+  values: {} # no value overrides are set here
--- a/charts/monitoring/templates/loki.yaml
+++ b/charts/monitoring/templates/loki.yaml
@@ -0,0 +1,121 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository
+metadata:
+  name: '{{ .Release.Name }}-loki'
+spec:
+  interval: 1h
+  url: https://grafana.github.io/helm-charts
+
+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: '{{ .Release.Name }}-loki'
+spec:
+  chart:
+    spec:
+      chart: loki
+      reconcileStrategy: ChartVersion
+      sourceRef:
+        apiVersion: source.toolkit.fluxcd.io/v1
+        kind: HelmRepository
+        name: '{{ .Release.Name }}-loki'
+        namespace: '{{ .Release.Namespace }}'
+  interval: 1h
+  values:
+    deploymentMode: SingleBinary
+    loki:
+      auth_enabled: false
+      server:
+        http_listen_port: 3100
+
+      memberlist:
+        join_members:
+          - loki-memberlist
+
+      schemaConfig: # index/chunk schema periods used by Loki
+        configs:
+          - from: "2020-05-15" # quoted so the date stays a string instead of a YAML date-typed scalar
+            store: tsdb
+            object_store: filesystem
+            schema: v13
+            index:
+              prefix: index_
+              period: 24h
+
+      storage:
+        type: filesystem
+
+      storage_config:
+        filesystem:
+          directory: /loki/chunks
+
+      limits_config:
+        reject_old_samples: true
+        reject_old_samples_max_age: 168h
+        max_cache_freshness_per_query: 10m
+        split_queries_by_interval: 15m
+        volume_enabled: true
+
+      common:
+        path_prefix: /loki
+        storage:
+          filesystem:
+            chunks_directory: /loki/chunks
+            rules_directory: /loki/rules
+        replication_factor: 1
+        ring:
+          instance_addr: 127.0.0.1
+          kvstore:
+            store: inmemory
+
+    # Enable persistent storage for the single-binary deployment
+    singleBinary:
+      persistence:
+        enabled: true
+        size: 10Gi
+        storageClass: '{{ .Values.globals.environment }}' # storage class is taken from the environment name (e.g. "prod"), not the cluster default
+      extraVolumeMounts:
+        - name: storage
+          mountPath: /loki
+
+    backend:
+      replicas: 0
+    read:
+      replicas: 0
+    write:
+      replicas: 0
+
+    ingester:
+      replicas: 0
+    querier:
+      replicas: 0
+    queryFrontend:
+      replicas: 0
+    queryScheduler:
+      replicas: 0
+    distributor:
+      replicas: 0
+    compactor:
+      replicas: 0
+    indexGateway:
+      replicas: 0
+    bloomCompactor:
+      replicas: 0
+    bloomGateway:
+      replicas: 0
+    promtail: # NOTE(review): confirm the `loki` chart consumes a `promtail` key; promtail may need its own HelmRelease
+      enabled: true
+      config:
+        snippets:
+          extraScrapeConfigs: |
+            - job_name: kubernetes-pods
+              kubernetes_sd_configs:
+                - role: pod
+              relabel_configs:
+                - source_labels: ["__meta_kubernetes_pod_container_name"]
+                  target_label: "container"
+                - source_labels: ["__meta_kubernetes_pod_name"]
+                  target_label: "pod"
+                - source_labels: ["__meta_kubernetes_pod_namespace"]
+                  target_label: "namespace"
--- a/charts/monitoring/templates/trivy.yaml
+++ b/charts/monitoring/templates/trivy.yaml
@@ -0,0 +1,25 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository # chart source referenced by the HelmRelease below
+metadata:
+  name: '{{ .Release.Name }}-aqua'
+spec:
+  interval: 1h
+  url: https://aquasecurity.github.io/helm-charts/
+
+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease # releases the `trivy-operator` chart from the repository declared above
+metadata:
+  name: '{{ .Release.Name }}-aqua'
+spec:
+  chart:
+    spec:
+      chart: trivy-operator
+      reconcileStrategy: ChartVersion
+      sourceRef:
+        apiVersion: source.toolkit.fluxcd.io/v1
+        kind: HelmRepository
+        name: '{{ .Release.Name }}-aqua' # must match the HelmRepository metadata.name above
+        namespace: '{{ .Release.Namespace }}'
+  interval: 1h
+  values: {} # no value overrides are set here
--- a/charts/monitoring/values.yaml
+++ b/charts/monitoring/values.yaml
@@ -0,0 +1,4 @@
+globals:
+  environment: prod # read by the templates as .Values.globals.environment
+graphana: # NOTE: spelled "graphana" (not "grafana"); templates read .Values.graphana.subdomain, so rename both together
+  subdomain: grafana
--- a/docs/monitoring.md
+++ b/docs/monitoring.md
@@ -0,0 +1,476 @@
+# Home Kubernetes Cluster Setup: Monitoring & Security Quickstart
+
+This guide provides a practical, lightweight setup for monitoring and security on your home Kubernetes cluster. It uses Helm for easy installation and focuses on essential features with minimal complexity.
+
+## Overview
+
+This setup includes:
+
+*   **Monitoring:** Prometheus + node-exporter + kube-state-metrics + Grafana (via the `kube-prometheus-stack` Helm chart).
+*   **Image Scanning & Supply-Chain:** Trivy (Trivy Operator) for automated in-cluster image vulnerability scanning.
+*   **Policy / Admission Control / Pod Security:** Kyverno for policy enforcement and Kubernetes Pod Security Admission (PSA) for baseline security.
+*   **Runtime Security / IDS:** Falco to detect suspicious syscalls and pod activity.
+*   **Network Segmentation:** Calico (or Cilium) CNI with basic NetworkPolicy configuration.
+*   **Ad-Hoc Checks:**  kube-bench (CIS benchmarks), kube-linter/kube-score (static analysis), and kube-hunter (penetration testing).
+
+## Prerequisites
+
+*   A functional Kubernetes cluster (managed or self-hosted).
+*   `kubectl` installed and configured to connect to your cluster.
+*   Helm v3 installed.
+
+## Installation
+
+These instructions assume you have `kubectl` and Helm set up and authenticated to your cluster.
+
+### 1. Monitoring (Prometheus + Grafana)
+
+*   Add the Prometheus community Helm repository:
+
+    ```bash
+    helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+    helm repo update
+    ```
+
+*   Create the `monitoring` namespace and install the `kube-prometheus-stack` chart:
+
+    ```bash
+    kubectl create ns monitoring
+    helm install kube-prometheus prometheus-community/kube-prometheus-stack --namespace monitoring
+    ```
+
+    *Optional*: Customize the installation by creating a `values.yaml` file to configure persistence, resource limits, and scrape intervals.  See *Configuration* below for a potential `values.yaml` you can adapt.
+
+*   Access Grafana:
+
+    ```bash
+    kubectl -n monitoring port-forward svc/kube-prometheus-grafana 3000:80
+    ```
+
+    Open `http://localhost:3000` in your browser. The default `admin` user password can be found in the chart's secrets (check the Helm chart documentation).
+
+    This provides node-exporter, kube-state-metrics, a Prometheus server, Alertmanager, and pre-built dashboards for your cluster.
+
+### 2. Image Scanning (Trivy Operator)
+
+*   Add the Aqua Security Helm repository:
+
+    ```bash
+    helm repo add aqua https://aquasecurity.github.io/helm-charts
+    helm repo update
+    ```
+
+*   Create the `trivy-system` namespace and install the `trivy-operator` chart:
+
+    ```bash
+    kubectl create ns trivy-system
+    helm install trivy-operator aqua/trivy-operator --namespace trivy-system
+    ```
+
+    Trivy Operator creates `VulnerabilityReport` and `ConfigAuditReport` CRDs.  It scans images running in the cluster for vulnerabilities.
+
+### 3. Policy Admission (Kyverno)
+
+*   Create the `kyverno` namespace and install Kyverno:
+
+    ```bash
+    kubectl create ns kyverno
+    kubectl apply -f https://github.com/kyverno/kyverno/releases/latest/download/install.yaml
+    ```
+
+*   Apply the example `ClusterPolicy` to deny privileged containers and hostPath mounts:
+
+    ```yaml
+    apiVersion: kyverno.io/v1
+    kind: ClusterPolicy
+    metadata:
+      name: deny-privileged-and-hostpath
+    spec:
+      rules:
+      - name: deny-privileged
+        match:
+          resources:
+            kinds: ["Pod","PodTemplate","CronJob","Job","Deployment","StatefulSet"]
+        validate:
+          message: "Privileged containers are not allowed"
+          deny:
+            conditions:
+            - key: "{{ request.object.spec.containers[].securityContext.privileged }}"
+              operator: Equals
+              value: true
+      - name: deny-hostpath
+        match:
+          resources:
+            kinds: ["Pod","PodTemplate","Deployment","StatefulSet"]
+        validate:
+          message: "hostPath volumes are not allowed"
+          pattern:
+            spec:
+              volumes:
+              - "*":
+                  hostPath: null
+    ```
+
+    Save the above as `kyverno-policy.yaml` and apply it:
+
+    ```bash
+    kubectl apply -f kyverno-policy.yaml
+    ```
+
+    Adapt the `match` section to target specific workload types.  See *Example Kyverno Policy* below.
+
+### 4. Pod Security Admission (PSA)
+
+*   Apply the `baseline` Pod Security Standard to the `default` namespace:
+
+    ```bash
+    kubectl label ns default pod-security.kubernetes.io/enforce=baseline
+    ```
+
+*   For a stricter security posture, use the `restricted` profile:
+
+    ```bash
+    kubectl label ns default pod-security.kubernetes.io/enforce=restricted
+    ```
+
+    PSA provides controls like preventing privileged containers and restricting host networking.
+
+### 5. Runtime Detection (Falco)
+
+*   Add the Falco Helm repository:
+
+    ```bash
+    helm repo add falcosecurity https://falcosecurity.github.io/charts
+    helm repo update
+    ```
+
+*   Create the `falco` namespace and install the `falco` chart:
+
+    ```bash
+    kubectl create ns falco
+    helm install falco falcosecurity/falco --namespace falco
+    ```
+
+    Falco detects suspicious container behavior and system calls.
+
+### 6. Network Policy & CNI
+
+*   If you haven't already, install a CNI that supports NetworkPolicy, such as Calico:
+
+    ```bash
+    kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml
+    ```
+
+    Alternatively, consider Cilium.
+
+*   Implement a default-deny NetworkPolicy:
+
+    ```yaml
+    apiVersion: networking.k8s.io/v1
+    kind: NetworkPolicy
+    metadata:
+      name: default-deny
+      namespace: my-namespace
+    spec:
+      podSelector: {}
+      policyTypes:
+      - Ingress
+      - Egress
+    ```
+
+    Save the above as `default-deny.yaml` and apply it to your namespace:
+
+    ```bash
+    kubectl apply -f default-deny.yaml
+    ```
+
+    Follow this up with explicit `allow` policies for necessary services.
+
+### 7. Cluster Hardening & Scans
+
+*   **kube-bench (CIS Benchmarks):**
+
+    ```bash
+    kubectl run --rm -it --image aquasec/kube-bench:latest kube-bench -- /kube-bench --version 1.23
+    ```
+
+    Refer to the kube-bench documentation for running as a Job or Pod.
+
+*   **kube-linter / kube-score (Static Manifest Checks):**
+
+    Install the CLI tool locally and analyze your Kubernetes manifests.
+
+*   **kube-hunter (Penetration Testing):**
+
+    ```bash
+    docker run aquasec/kube-hunter:latest --remote <K8S_API_ENDPOINT>
+    ```
+
+## Configuration
+
+This section provides example configuration files and tips to customize the setup for a home Kubernetes cluster.
+
+### Example `values.yaml` for `kube-prometheus-stack`
+
+This reduces resource usage and disables the chart's Alertmanager, which is optional for a home setup.  It also disables the default Grafana dashboards you might not need initially and shortens Prometheus retention to 7 days.
+
+```yaml
+# values.yaml for kube-prometheus-stack
+
+prometheus:
+  prometheusSpec:
+    # reduce resource rqts / limits
+    resources:
+       requests:
+         memory: 1Gi
+         cpu: 200m
+       limits:
+         memory: 2Gi
+         cpu: 500m
+
+    # Reduce storage retention
+    retention: 7d
+    storageSpec:
+      volumeClaimTemplate:
+        spec:
+          storageClassName: "local-path" # Or your storage class
+          accessModes: ["ReadWriteOnce"]
+          resources:
+            requests:
+              storage: 10Gi # adjust as needed
+
+alertmanager:
+  enabled: false # for quick home setup, send directly to telegram etc.
+grafana:
+  enabled: true
+  defaultDashboardsEnabled: false   # Disable default dashboards
+  sidecar:
+    dashboards:
+      enabled: true
+      provider:
+        folders:
+          fromConfigMap: true # Load custom dashboards from ConfigMaps
+
+kubeStateMetrics:
+  enabled: true
+
+nodeExporter:
+  enabled: true
+```
+
+To use this configuration, save it as `values.yaml` and run:
+
+```bash
+helm install kube-prometheus prometheus-community/kube-prometheus-stack --namespace monitoring -f values.yaml
+```
+
+Adapt the `storageClassName` and storage amounts to your environment.
+
+### Example Kyverno Policy - Disallow Root User / Require Distroless
+
+This example expands on the previous policy.  It requires that containers not run as UID 0 and suggests distroless images.  It also requires that privilege escalation be disabled:
+
+```yaml
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: require-non-root-user-and-distroless
+  annotations:
+    policies.kyverno.io/title: Require Non-Root User and Distroless Images
+    policies.kyverno.io/category: Security
+    policies.kyverno.io/severity: medium
+    policies.kyverno.io/subject: Pod
+    policies.kyverno.io/description: >-
+      Containers should not run as root, and ideally, be based on Distroless
+      images where possible. This policy requires that containers define
+      `runAsUser`, and that `runAsUser` is not `0`.  It also generates a warning
+      if the image is not based on a distroless image, although does not reject
+      the deployment.
+
+spec:
+  validationFailureAction: Enforce
+  rules:
+    - name: check-runasnonroot
+      match:
+        any:
+          - resources:
+              kinds:
+                - Pod
+      validate:
+        message: "Containers must not run as root. Specify a non-zero runAsUser in securityContext."
+        pattern:
+          spec:
+            containers:
+              - securityContext:
+                  runAsUser: "!0" # not equal to zero
+    - name: check-allowprivilegeescalation
+      match:
+        any:
+          - resources:
+              kinds:
+                - Pod
+      validate:
+        message: "Containers must set allowPrivilegeEscalation to false."
+        pattern:
+          spec:
+            containers:
+              - securityContext:
+                  allowPrivilegeEscalation: "false"
+    - name: warn-distroless
+      match:
+        any:
+          - resources:
+              kinds:
+                - Pod
+      verifyImages:
+        - imageReferences:
+            - "*"  # all images
+          attestations:
+            - policy:
+                subjects:
+                  - name: distroless
+                conditions:
+                  all:
+                    - key: "ghcr.io/distroless/static:latest"  # Example -  Check if the image is distroless.  You can use wildcards
+                      operator: In
+                      value: "{{ image.repoDigests }}"
+                  # You can add other keys and values to check
+
+      mutate:
+        overlay:
+          metadata:
+            annotations:
+              "image.distroless.warn": "This image isn't distroless -- see https://github.com/GoogleContainerTools/distroless"
+```
+
+### Alertmanager to Telegram
+
+1.  **Create a Telegram Bot:** Search for `@BotFather` on Telegram. Use the `/newbot` command. Give your bot a name and a unique username.  BotFather will give you the bot's API token.
+
+2.  **Get your Telegram Chat ID:** Send a message to your bot.  Then, in a browser, go to  `https://api.telegram.org/bot<YOUR_BOT_API_TOKEN>/getUpdates` (replace `<YOUR_BOT_API_TOKEN>`). The `chat.id` value in the JSON response is your chat ID.
+
+3.  **Create a Secret in Kubernetes:**
+
+    ```bash
+    kubectl create secret generic telegram-secrets \
+      --from-literal=bot_token="<YOUR_BOT_API_TOKEN>" \
+      --from-literal=chat_id="<YOUR_CHAT_ID>"
+    ```
+
+    Replace the placeholders with the correct values.
+
+4.  **Add Alertmanager Configuration:**
+
+    You would normally patch the default Alertmanager configuration provided by `kube-prometheus-stack`.  Because we disabled the chart's Alertmanager component for simplicity's sake, we'll instead define an additional `PrometheusRule` that sends alerts to a webhook (and have a small sidecar container forward them to Telegram).
+
+Example:
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    prometheus: k8s
+    role: alert-rules
+  name: prometheus-to-telegram
+  namespace: monitoring
+spec:
+  groups:
+    - name: kubernetes-home-cluster
+      rules:
+        - alert: PrometheusToTelegramAlert
+          annotations:
+            description: 'Alert sent from Prometheus goes to telegram'
+          expr: vector(1)
+          labels:
+            severity: critical
+          for: 1s
+          actions:
+            - name: SendToTelegramAction
+              url: 'http://localhost:8080/message'
+              parameters:
+                text: Alert from Prometheus: {{ .Alerts.Firing | len }} firing alert{{ if gt (len .Alerts.Firing) 1 }}s{{ end }}.\nSeverity: {{ .CommonLabels.severity }}\nDescription: {{ .CommonAnnotations.description }}
+```
+
+Now you will create a deployment that runs a small webhook server forwarding these alerts to telegram:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prometheus-telegram
+  namespace: monitoring
+spec:
+  selector:
+    matchLabels:
+      app: prometheus-telegram
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: prometheus-telegram
+    spec:
+      containers:
+      - name: webhook
+        image: nginx
+        ports:
+        - containerPort: 8080
+      - name: telegram-forwarder
+        image: alpine/curl
+        command: ["/bin/sh"]
+        args:
+        - "-c"
+        - |
+          while true; do
+            nc -l -p 8080 | sed 's/text=/text=Alert from Prometheus: /g' | curl -sS --fail -X POST "https://api.telegram.org/bot$(TELEGRAM_BOT_TOKEN)/sendMessage" -d chat_id=$(TELEGRAM_CHAT_ID) -d "$$(cat)"
+            sleep 1;
+          done
+        env:
+        - name: TELEGRAM_BOT_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: telegram-secrets
+              key: bot_token
+        - name: TELEGRAM_CHAT_ID
+          valueFrom:
+            secretKeyRef:
+              name: telegram-secrets
+              key: chat_id
+```
+
+**Explanation:**
+
+*   It creates an Nginx container as an HTTP listener to avoid unnecessary security errors in Prometheus.
+*   The `telegram-forwarder` container uses `curl` and `nc` to forward the POST from Prometheus to the Telegram API, using the secrets for authentication.
+
+## Operational Tips
+
+*   **Resource Management:**  Set resource limits and requests for components, especially Prometheus and Grafana. Adjust scrape intervals for Prometheus to reduce load.
+*   **Persistence:**  Use persistent volumes for Grafana and Prometheus to preserve dashboards and historical data.
+*   **Alerting:**  Configure Alertmanager with a Telegram or Discord webhook for notifications. This is *simpler* than email for home setups.
+*   **Trivy & Image Blocking:** To automatically block vulnerable images, integrate Trivy with admission webhooks (using Kyverno to reject deployments based on Trivy reports).
+*   **Backups:** Regularly back up etcd (if self-hosting the control plane) and potentially Prometheus/Grafana data.
+
+## Getting Started Quickly
+
+Follow this installation order:
+
+1.  Install your `CNI`.
+2.  Install `kube-prometheus-stack`, using `values.yaml` to reduce resources.
+3.  Log in to Grafana (installed as part of `kube-prometheus-stack`) and import dashboards.
+4.  Enable PSA on namespaces.
+5.  Install Kyverno and create deny policies.
+6.  Install Trivy Operator for image scanning visibility.
+7.  Install Falco for runtime detection.
+8.  Run `kube-bench` and `kube-linter` for initial assessment.
+
+## Useful Resources
+
+*   [kube-prometheus-stack (Helm)](https://github.com/prometheus-community/helm-charts)
+*   [trivy-operator](https://github.com/aquasecurity/trivy-operator)
+*   [Kyverno](https://kyverno.io/)
+*   [Falco](https://falco.org/)
+*   [Calico CNI](https://www.tigera.io/project-calico/)
+*   [Aqua Security (kube-hunter, kube-bench)](https://www.aquasec.com/) and [kube-linter](https://github.com/stackrox/kube-linter)
+
+This guide provides a solid foundation for setting up monitoring and security on your home Kubernetes cluster.  Adapt the configurations and policies to your specific needs and experiment!
--- a/docs/prepare-server.md
+++ b/docs/prepare-server.md
@@ -0,0 +1,216 @@
+<!-- Guide: preparing a Debian server to run K3s. -->
+
+
+# Optimizing Debian for K3s
+
+This guide outlines steps to optimize a Debian server for running K3s (Lightweight Kubernetes). Optimization involves a combination of general Linux best practices, K3s-specific recommendations, and considerations for your specific workload.
+
+## Table of Contents
+
+- [1. Debian Base System Optimization](#1-debian-base-system-optimization)
+  - [a. Kernel Parameters (sysctl.conf)](#a-kernel-parameters-sysctlconf)
+  - [b. User Limits (ulimit)](#b-user-limits-ulimit)
+  - [c. Disable Unnecessary Services](#c-disable-unnecessary-services)
+  - [d. Update System](#d-update-system)
+  - [e. Swap Configuration](#e-swap-configuration)
+- [2. K3s Specific Optimizations](#2-k3s-specific-optimizations)
+  - [a. Choose a Performant Storage Backend](#a-choose-a-performant-storage-backend)
+  - [b. Containerd Tuning](#b-containerd-tuning)
+  - [c. K3s Server and Agent Configuration](#c-k3s-server-and-agent-configuration)
+  - [d. CNI Choice](#d-cni-choice)
+- [3. General Server Best Practices](#3-general-server-best-practices)
+  - [a. Fast Storage](#a-fast-storage)
+  - [b. Adequate RAM and CPU](#b-adequate-ram-and-cpu)
+  - [c. Network Configuration](#c-network-configuration)
+  - [d. Monitoring](#d-monitoring)
+  - [e. Logging](#e-logging)
+- [4. Post-Optimization Verification](#4-post-optimization-verification)
+
+---
+
+## 1. Debian Base System Optimization
+
+These steps are generally beneficial for any server, but particularly important for containerized environments like K3s.
+
+### a. Kernel Parameters (sysctl.conf)
+
+Edit `/etc/sysctl.conf` and apply changes with `sudo sysctl -p`.
+
+```ini
+# Increase maximum open files (for container processes, K3s components)
+fs.inotify.max_user_watches = 524288 # For fs-based operations within containers
+fs.inotify.max_user_instances = 8192 # For fs-based operations within containers
+fs.file-max = 2097152 # Increase overall system file handle limit
+
+# Increase limits for network connections
+net.core.somaxconn = 65535 # Max backlog of pending connections
+net.ipv4.tcp_tw_reuse = 1 # Allow reuse of TIME_WAIT sockets (caution: can sometimes mask issues)
+net.ipv4.tcp_fin_timeout = 30 # Shorten how long orphaned connections linger in FIN-WAIT-2
+net.ipv4.tcp_max_syn_backlog = 65535 # Max number of remembered connection requests
+net.ipv4.tcp_keepalive_time=600 # Shorter keepalive interval
+net.ipv4.tcp_keepalive_intvl=60 # Keepalive interval
+net.ipv4.tcp_keepalive_probes=3 # Keepalive probes
+
+# Increase memory limits for network buffers (especially if high network traffic)
+net.core.rmem_max = 26214400
+net.core.wmem_max = 26214400
+net.core.rmem_default = 26214400
+net.core.wmem_default = 26214400
+
+# Other useful parameters
+vm.max_map_count = 262144 # Essential for Elasticsearch, MongoDB, etc.
+vm.dirty_ratio = 10 # Force writers to flush once dirty pages reach 10% of memory
+vm.dirty_background_ratio = 5 # Start background writeback at 5% dirty pages (must stay below vm.dirty_ratio)
+kernel.pid_max = 4194304 # Increase max PIDs
+```
+
+**Explanation:**
+- `fs.file-max`: K3s and its deployed containers can open a large number of files. Increasing this prevents "Too many open files" errors.
+- `net.*`: These parameters help in handling a high number of concurrent network connections crucial for a Kubernetes cluster.
+- `vm.max_map_count`: Required by some applications that run on Kubernetes (e.g., Elasticsearch).
+
+### b. User Limits (ulimit)
+
+Edit `/etc/security/limits.conf` (or create a file like `/etc/security/limits.d/k3s.conf`) for all users, or specifically for the user K3s runs as (often `root` by default or a dedicated `k3s` user).
+
+```
+# For all users (or a specific k3s user if you configure it)
+*    soft nofile 65536
+*    hard nofile 131072
+*    soft nproc  65536
+*    hard nproc  131072
+```
+**Note:** A reboot or logging out/in is often required for these changes to take effect for user sessions. Services typically pick up new limits upon restart.
+
+**Explanation:**
+- `nofile` (number of open files): Sets the per-user/per-process limit. K3s and pods need a high limit.
+- `nproc` (number of processes): Each container consumes processes. A high limit prevents hitting a ceiling.
+
+### c. Disable Unnecessary Services
+
+Reducing background services frees up CPU, RAM, and I/O.
+```bash
+sudo systemctl disable --now apache2 # Example, replace with actual unused services
+sudo systemctl disable --now nginx    # Example
+sudo systemctl disable --now cups     # If not using printing
+sudo systemctl disable --now modemmanager # If not using a modem
+sudo systemctl disable --now bluetooth # If no bluetooth devices
+# Review active services using:
+# systemctl list-unit-files --type=service --state=enabled
+```
+
+### d. Update System
+
+Keep your system packages up-to-date for security and performance bug fixes.
+```bash
+sudo apt update
+sudo apt upgrade -y
+sudo apt dist-upgrade -y # For major version changes (if applicable)
+sudo apt autoremove -y
+sudo reboot # After significant kernel or base system updates
+```
+
+### e. Swap Configuration
+
+**It is generally recommended to disable swap on K3s nodes, especially worker nodes.** Swapping can severely degrade performance in containerized environments due to unpredictable latency.
+
+If you absolutely must have swap (e.g., very low memory server, not recommended for production):
+*   Reduce swappiness: `sudo sysctl vm.swappiness=10` (or even `1`). Add `vm.swappiness = 10` to `/etc/sysctl.conf`.
+*   Preferably, disable swap entirely if you have sufficient RAM:
+    ```bash
+    sudo swapoff -a
+    sudo sed -i '/ swap / s/^/#/' /etc/fstab
+    ```
+    **WARNING:** Only disable swap if your system has sufficient RAM to handle its workload without it. If nodes run out of memory without swap, processes will be OOM-killed, leading to instability.
+
+## 2. K3s Specific Optimizations
+
+### a. Choose a Performant Storage Backend
+
+The choice of K3s's data store significantly impacts performance and availability.
+
+*   **SQLite (Default):** Good for single-node setups or small, non-critical clusters. Performance can degrade with many changes or large clusters.
+*   **External Database (MariaDB/MySQL, PostgreSQL):**
+    *   **Recommended for Production:** Offers high availability and better performance than embedded SQLite for multi-node K3s server configurations.
+    *   **Placement:** Place the external database on a separate server or on a dedicated, fast storage volume.
+*   **External etcd:** Offers the best performance and scalability, but is more complex to manage and requires its own dedicated etcd cluster.
+
+### b. Containerd Tuning
+
+K3s uses containerd as its container runtime.
+
+*   **Fast Storage for Containerd:** Ensure the directories where containerd stores its data are on fast storage (NVMe SSDs are ideal).
+    *   `/var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs` (K3s specific)
+    *   (`/var/lib/containerd` if using a standalone containerd setup)
+    This is critical for image pulls, container startup, and overlayfs performance.
+
+### c. K3s Server and Agent Configuration
+
+Configure K3s using a configuration file (e.g., `/etc/rancher/k3s/config.yaml`) or command-line flags.
+
+*   **Disable Unused Components:** Reduce resource consumption by disabling features you don't need.
+    *   `--disable traefik`: If using Nginx Ingress Controller or another ingress.
+    *   `--disable servicelb`: If using a cloud provider Load Balancer, MetalLB, or another solution.
+    *   `--disable local-storage`: If using cloud provider storage, NFS, or another remote storage solution.
+    *   `--disable metrics-server`: If using a different metrics solution or don't need it.
+    *   `--disable helm-controller`: If exclusively using `kubectl` for deployments.
+
+    **Example `/etc/rancher/k3s/config.yaml` for a server node:**
+    ```yaml
+    # /etc/rancher/k3s/config.yaml
+    server: true
+    disable:
+      - traefik
+      - servicelb
+      - local-storage
+      - metrics-server
+    # Example for external database
+    # datastore-endpoint: "mysql://k3s:password@tcp(db-server:3306)/kube?parseTime=true"
+    ```
+
+### d. CNI Choice
+
+K3s defaults to Flannel (with VXLAN), which is performant for many use cases.
+*   **Alternative CNIs (Calico, Cilium):** If you require advanced network policies, superior performance in high-throughput scenarios, or specific networking features, consider replacing Flannel. These can offer better raw throughput or lower latency but add complexity.
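+    *   When installing K3s with a different CNI, disable Flannel (`--flannel-backend=none`, usually together with `--disable-network-policy`, since most CNIs ship their own network policy engine) and then install your chosen CNI.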
+    *   Ensure your chosen CNI is optimized with the correct kernel modules and sysctls.
+
+## 3. General Server Best Practices
+
+### a. Fast Storage
+
+*   **SSD/NVMe:** Absolutely crucial for K3s performance, especially for the K3s data directory (`$K3S_DATA_DIR`, default: `/var/lib/rancher/k3s`, which also holds K3s's containerd data under `agent/containerd`), `/var/lib/containerd` if using a standalone containerd setup, and the operating system itself. Pod startup times, image pulls, and database operations are heavily I/O bound.
+*   **RAID:** If using multiple drives, consider RAID1 or RAID10 for redundancy and increased I/O performance.
+
+### b. Adequate RAM and CPU
+
+*   **RAM:** K3s server nodes run the control plane and, when an embedded datastore is used, the datastore itself, so they need more RAM than agent-only nodes. Worker nodes also need ample RAM for their pods. Err on the side of more RAM.
+*   **CPU:** Ensure sufficient CPU cores for K3s components, containers, and your workloads.
+
+### c. Network Configuration
+
+*   **Gigabit Ethernet (at least):** 10Gbps or faster is ideal for larger clusters or high-bandwidth applications.
+*   **MTU:** Ensure consistent MTU settings across all nodes and your network infrastructure. K3s default CNI (Flannel VXLAN) might use a smaller MTU (e.g., 1450) due to encapsulation overhead. Misconfigured MTU can lead to packet fragmentation and performance issues.
+*   **Jumbo Frames:** If your network supports them and all components are configured for them, jumbo frames (e.g., 9000 bytes MTU) can reduce overhead and improve throughput, but they require careful and consistent configuration.
+
+### d. Monitoring
+
+*   **Prometheus/Grafana:** Essential for monitoring resource usage (CPU, RAM, disk I/O, network) of your nodes and K3s components. This helps identify and diagnose bottlenecks.
+*   **Kube-state-metrics:** Provides metrics about Kubernetes objects.
+*   **Node Exporter:** Provides system-level metrics.
+*   **cAdvisor (usually bundled with container runtimes):** Provides container-level metrics.
+
+### e. Logging
+
+*   **Centralized Logging (ELK Stack, Loki, etc.):** Stream logs from K3s components and pods to a central logging system for easier debugging, troubleshooting, and performance analysis.
+
+## 4. Post-Optimization Verification
+
+1.  **Reboot:** After making changes to kernel parameters or `limits.conf`, a full system reboot is often the safest way to ensure all changes are fully applied.
+2.  **Verify sysctl settings:** `sudo sysctl -a | grep -i <parameter_name>` (e.g., `sudo sysctl -a | grep -i fs.file-max`)
+3.  **Verify ulimits:** Check `ulimit -n` and `ulimit -u` in a new shell. For specific running processes, inspect `/proc/<pid>/limits`.
+4.  **Monitor Performance:** Use tools like `htop`, `iostat`, `netstat`, `dstat`, and your installed monitoring stack (Prometheus/Grafana) to observe the impact of your changes. Look for reduced CPU usage, lower I/O wait, improved network throughput, and stable memory usage.
+5.  **Test Workloads:** Deploy your actual applications and perform load testing to ensure the optimizations yield the desired performance benefits under realistic conditions.
+
+By diligently following these steps, you can establish a robust and highly performant Debian environment for your K3s cluster. Always test changes in a staging or development environment before applying them to production systems.
+```
--- a/src/resources/homelab/postgres-database/postgres-database.ts
+++ b/src/resources/homelab/postgres-database/postgres-database.ts
@@ -124,19 +124,23 @@ class PostgresDatabase extends CustomResource<typeof specSchema> {
      user: clusterSecret.user,
      password: clusterSecret.password,
    });
-    const connectionError = await database.ping();
-    if (connectionError) {
-      console.error('Failed to connect', connectionError);
-      throw new NotReadyError('FailedToConnectToDatabase');
+    try {
+      const connectionError = await database.ping();
+      if (connectionError) {
+        console.error('Failed to connect', connectionError);
+        throw new NotReadyError('FailedToConnectToDatabase');
+      }
+      await database.upsertRole({
+        name: secret.user,
+        password: secret.password,
+      });
+      await database.upsertDatabase({
+        name: secret.database,
+        owner: secret.user,
+      });
+    } finally {
+      await database.close();
    }
-    await database.upsertRole({
-      name: secret.user,
-      password: secret.password,
-    });
-    await database.upsertDatabase({
-      name: secret.database,
-      owner: secret.user,
-    });
  };
 }

--- a/src/services/postgres/postgres.instance.ts
+++ b/src/services/postgres/postgres.instance.ts
@@ -60,6 +60,10 @@ class PostgresInstance {
      await this.#db.raw(`ALTER DATABASE "${name}" OWNER TO "${owner}"`);
    }
  };
+
+  public close = async () => {
+    await this.#db.destroy();
+  };
 }

 export { PostgresInstance, type PostgresInstanceOptions };
Author	SHA1	Message	Date
Morten Olsen	ff06613e99	updates	2025-09-05 11:22:58 +02:00
Morten Olsen	9fe279b1b5	docs	2025-09-05 08:56:04 +02:00