Commit 60925554 by Théo Zimmermann

Initial commit: document how to set up Coder on Kubernetes on OpenStack.

.envrc 0 → 100644
use flake
.gitignore 0 → 100644
openrc
.direnv
.ipynb_checkpoints
result
*.tar.zst
*.zip
*~
Coder-management.ipynb 0 → 100644 (diff collapsed)
Dockerfile 0 → 100644
FROM inf110-workspace-base:latest
# Install the coq-lsp extension for code-server
RUN code-server --install-extension ejgallego.coq-lsp
# Install the Jupyter / Python extensions for code-server
RUN code-server --install-extension ms-toolsai.jupyter
RUN code-server --install-extension ms-python.python
# Suppress error message when running git pull
RUN git config --global --add safe.directory /home/coder/tp
# Add code-server user settings
COPY user-settings.json /home/coder/.local/share/code-server/User/settings.json
README.md 0 → 100644
## Installation process
This document describes the installation process of Coder for the INF110 labs on the R2 OpenStack cluster.
It should be possible to adapt this to other use cases or OpenStack clusters with minor modifications.
### Create the cluster template
```bash
source openrc # done automatically by the Nix dev shell; make sure this file exists
export KEYPAIR= # your keypair
openstack coe cluster template create \
--coe kubernetes \
--image coreos-35 \
--external-network $(openstack network show provider -f json | jq -r .id) \
--keypair $KEYPAIR \
--network-driver flannel \
--volume-driver cinder \
--dns-nameserver 137.194.2.16 \
--flavor m2.xlarge \
--master-flavor m1.medium \
--docker-storage-driver overlay2 \
--labels 'auto_healing_enabled=true,auto_scaling_enabled=true,min_node_count=1,max_node_count=100,boot_volume_type=LVM-NVME,etcd_volume_type=LVM-NVME,docker_volume_type=LVM-NVME' \
--registry-enabled \
--insecure-registry '10.0.0.99:5000' \
--server-type vm \
--floating-ip-disabled \
coder
```
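You can inspect the resulting template with:
```bash
openstack coe cluster template show coder
```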
### Create the Kubernetes cluster
```bash
openstack coe cluster create --cluster-template coder --node-count 1 inf110
```
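Cluster creation takes a while; you can poll its status until it reaches `CREATE_COMPLETE`:
```bash
watch openstack coe cluster show inf110 -f value -c status
```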
It can happen that the cluster creation hangs. In that case, log in to the master node and check if it reports that the `etcd` service failed. If so, restart the `etcd` service and the cluster creation should continue.
```bash
sudo systemctl restart etcd
```
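If you do not already have a shell on the master, something like the following should locate it (a sketch; at this stage the jump host set up later does not exist yet, so adapt the connectivity to your environment):
```bash
# Hypothetical lookup; assumes the master address is reachable from your machine
export MASTER_IP=$(openstack coe cluster show inf110 -f json | jq -r '.master_addresses[0]')
ssh core@$MASTER_IP sudo systemctl status etcd
```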
### Create a VM for the local Docker registry
This registry will be used to store the Docker image of the Coder workspace.
```bash
openstack server create \
--flavor smi-worker \
--image coreos-35 \
--no-network \
--security-group default \
--security-group registry \
--availability-zone nova \
--key-name $KEYPAIR \
inf110-registry
```
Because we have set a fixed IP for the Docker insecure registry in the cluster template (`10.0.0.99:5000`), we need to assign this IP to the VM.
```bash
openstack server add fixed ip \
--fixed-ip-address 10.0.0.99 \
inf110-registry \
inf110
```
### Create a VM for the reverse proxy
The reverse proxy will be used to access Coder via HTTPS.
```bash
openstack server create \
--flavor smi-worker \
--image ubuntu-jammy-19-09-2023 \
--network inf110 \
--security-group default \
--security-group ssh_icmp \
--security-group HTTP_HTTPS \
--availability-zone nova \
--key-name $KEYPAIR \
inf110-reverse-proxy
```
Because we have a DNS entry for the reverse proxy, we need to assign a specific floating IP (`137.194.210.143`) to it.
```bash
openstack server add floating ip \
inf110-reverse-proxy \
137.194.210.143
```
### Install the Docker registry
Launch the Docker registry, backed by Swift, on the registry VM.
```bash
ssh -J ubuntu@137.194.210.143 core@10.0.0.99 sudo docker run -d \
-p 5000:5000 \
-e REGISTRY_STORAGE=swift \
-e REGISTRY_STORAGE_SWIFT_CONTAINER=docker_registry \
-e REGISTRY_STORAGE_SWIFT_AUTHURL="$OS_AUTH_URL" \
-e REGISTRY_STORAGE_SWIFT_USERNAME="$OS_USERNAME" \
-e REGISTRY_STORAGE_SWIFT_PASSWORD="$OS_PASSWORD" \
-e REGISTRY_STORAGE_SWIFT_REGION="$OS_REGION_NAME" \
-e REGISTRY_STORAGE_SWIFT_DOMAIN="$OS_USER_DOMAIN_NAME" \
--restart=always \
--name registry \
registry:2
```
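To check that the registry came up, you can query the Docker Registry HTTP API v2 from the registry VM (a quick sanity check; the repository list should be empty at this point):
```bash
ssh -J ubuntu@137.194.210.143 core@10.0.0.99 curl -s http://localhost:5000/v2/_catalog
# {"repositories":[]}
```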
### Build the Coder workspace image
This part requires both Nix and Docker.
```bash
docker load -i $(nix build .#dockerContainers.inf110-workspace --print-out-paths)
docker build -t 10.0.0.99:5000/inf110-workspace:latest . # tag with the registry address so the image can be pushed as-is after loading
docker save 10.0.0.99:5000/inf110-workspace:latest -o inf110-workspace.tar
# Unpacking a zstd-compressed tarball is not supported by Docker on CoreOS 35
# and compressing with gzip or xz is too slow.
#docker save 10.0.0.99:5000/inf110-workspace:latest | zstd > inf110-workspace.tar.zst
```
### Push the Coder workspace image to the insecure Docker registry
Since the Docker registry is not accessible publicly, we use `scp` to copy the image to a node in the cluster and push it from there.
```bash
export NODE_IP=$(openstack coe cluster show inf110 -f json | jq -r '.node_addresses[0]')
scp -J ubuntu@137.194.210.143 inf110-workspace.tar core@$NODE_IP:/tmp
ssh -J ubuntu@137.194.210.143 core@$NODE_IP
sudo -i
docker load -i /tmp/inf110-workspace.tar
docker push 10.0.0.99:5000/inf110-workspace:latest
rm /tmp/inf110-workspace.tar
```
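Still on the node, you can confirm that the push succeeded via the registry's `tags/list` endpoint:
```bash
curl -s http://10.0.0.99:5000/v2/inf110-workspace/tags/list
# {"name":"inf110-workspace","tags":["latest"]}
```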
### Expose the Kubernetes dashboard
For this, we need to log in to the master node.
```bash
export MASTER_IP=$(openstack coe cluster show inf110 -f json | jq -r '.master_addresses[0]')
ssh -J ubuntu@137.194.210.143 core@$MASTER_IP
sudo -i
kubectl expose deployment kubernetes-dashboard --type=LoadBalancer --name=kube-dashboard -n kube-system
kubectl get svc -n kube-system # get the external IP of the dashboard
kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep admin-token | awk '{print $1}') # get the admin token to log into the dashboard
```
### Install Coder
Copy the `postgresql-pvc.yml`, `postgresql-values.yml`, and `coder-values.yml` files to the master node.
```bash
scp -J ubuntu@137.194.210.143 postgresql-pvc.yml postgresql-values.yml coder-values.yml core@$MASTER_IP:/tmp
```
Once the files are copied, log in to the master node and install Coder.
```bash
ssh -J ubuntu@137.194.210.143 core@$MASTER_IP
sudo -i
rpm-ostree install helm # no reboot required
kubectl create namespace coder
kubectl apply -f /tmp/postgresql-pvc.yml -n coder
helm repo add bitnami https://charts.bitnami.com/bitnami
helm install coder-db bitnami/postgresql -n coder -f /tmp/postgresql-values.yml --version 13.0.0 # latest chart version compatible with Helm 3.2, the version available on CoreOS 35
kubectl create secret generic coder-db-url -n coder --from-literal=url="postgres://coder:coder@coder-db-postgresql.coder.svc.cluster.local:5432/coder?sslmode=disable"
helm repo add coder-v2 https://helm.coder.com/v2
helm install coder coder-v2/coder --namespace coder --values /tmp/coder-values.yml --version 2.13.5 # Latest stable version
```
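To verify that the installation succeeded, check that the pods come up:
```bash
kubectl get pods -n coder # the coder and coder-db-postgresql pods should reach the Running state
```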
A public IP is created for the Coder load balancer, but it is not needed, since Coder will be accessed via HTTPS through the reverse proxy. We can therefore remove the public IP.
### Install the reverse proxy
Retrieve the private IP of the Coder load balancer.
```bash
openstack loadbalancer list -f json | jq -r \
'.[] | select(.name | contains("coder")) | .vip_address'
```
(This can also be done via the OpenStack dashboard, where the public IP can be removed as well.)
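For reference, a possible CLI equivalent for releasing the public IP (replace `<FLOATING_IP>` with the address shown by `openstack floating ip list`):
```bash
openstack floating ip unset --port <FLOATING_IP> # detach it from the load balancer port
openstack floating ip delete <FLOATING_IP>       # release the address back to the pool
```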
Install the reverse proxy on the reverse proxy VM.
```bash
ssh ubuntu@137.194.210.143
sudo -i
apt install nginx python3-certbot-nginx
certbot --nginx -d tp-inf110.r2.enst.fr
```
Edit the `/etc/nginx/sites-available/default` file (`vim /etc/nginx/sites-available/default`) and replace the `location /` block in the SSL server block with the following:
```nginx
location / {
proxy_pass http://10.0.0.XXX;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_set_header Host $host;
}
```
replacing `10.0.0.XXX` with the private IP of the Coder load balancer.
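Check the configuration before restarting:
```bash
nginx -t
```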
Restart the Nginx service.
```bash
systemctl restart nginx
```
## Coder management
Access Coder via the reverse proxy (https://tp-inf110.r2.enst.fr) and create the admin user.
Go to https://tp-inf110.r2.enst.fr/settings/tokens and create a new token, which will be used to manage Coder via its API in the `Coder-management.ipynb` notebook.
Add its value to the `openrc` file:
```bash
export CODER_TOKEN= # the token
```
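To check that the token works, you can query the Coder API directly (a sketch using httpie, which is available in the dev shell; the Coder v2 API expects the token in the `Coder-Session-Token` header):
```bash
http https://tp-inf110.r2.enst.fr/api/v2/users/me "Coder-Session-Token:$CODER_TOKEN"
```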
Launch `jupyter lab`, open the `Coder-management.ipynb` notebook, and follow the instructions there.
## Troubleshooting
### Cluster auto-scaling issues
If the cluster is in the "Update failed" state and the underlying issue (e.g., an exceeded quota) has been resolved, the cluster state can be restored with the following command:
```bash
openstack coe cluster resize inf110 $(openstack coe cluster show inf110 -f json | jq .node_count)
```
The above is a request to resize to the current (known) node count. If the current count is incorrect, you can specify a different count.
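You can then watch the cluster return to a healthy state:
```bash
openstack coe cluster show inf110 -f value -c status # should eventually report UPDATE_COMPLETE
```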
coder-values.yml 0 → 100644
coder:
env:
- name: CODER_PG_CONNECTION_URL
valueFrom:
secretKeyRef:
name: coder-db-url
key: url
- name: CODER_ACCESS_URL
value: "https://tp-inf110.r2.enst.fr"
flake.lock 0 → 100644
{
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1725816686,
"narHash": "sha256-0Kq2MkQ/sQX1rhWJ/ySBBQlBJBUK8mPMDcuDhhdBkSU=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "add0443ee587a0c44f22793b8c8649a0dbc3bb00",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}
flake.nix 0 → 100644
{
inputs = {
nixpkgs.url = "github:NixOS/nixpkgs/nixpkgs-unstable";
};
outputs = { self, nixpkgs }:
let pkgs = nixpkgs.legacyPackages.x86_64-linux;
in
{
devShell.x86_64-linux =
pkgs.mkShell {
buildInputs = with pkgs; [
(python311.withPackages
(ps: with ps; [
# We need to make sure that all the Python programs are installed
# with the same version of Python to avoid overriding PYTHONPATH
jupyter
python-openstackclient
python-magnumclient
python-octaviaclient
httpie
]))
pwgen
jq
];
shellHook = ''
source openrc
'';
};
dockerContainers.inf110-workspace =
pkgs.dockerTools.buildLayeredImage {
name = "inf110-workspace-base";
tag = "latest";
# Update to the latest version of the image with the output of:
# $ nix run nixpkgs#nix-prefetch-docker -- --image-name codercom/code-server --image-tag latest --arch amd64 --os linux
fromImage = pkgs.dockerTools.pullImage {
imageName = "codercom/code-server";
imageDigest = "sha256:f32cfa5e7cc768f60969f1d2183fc4c7aa093b80b93c365535e83f79ccf004f2";
sha256 = "0aa297aqrhfi0c3wybd9wld95axsn84g0caprzqrihdfxzh9bw91";
finalImageName = "codercom/code-server";
finalImageTag = "latest";
};
config = {
User = "coder";
Env = [
"LANG=en_US.UTF-8"
"HOME=/home/coder"
];
};
contents = with pkgs; buildEnv {
name = "copyToRoot";
paths = [
bashInteractive
coq_8_19
coqPackages_8_19.coq-lsp
dockerTools.binSh
(python3.withPackages (ps: with ps; [ jupyter-client ipykernel ]))
];
pathsToLink = [ "/bin" ];
};
};
};
}
README.md (Coder workspace template) 0 → 100644
---
display_name: TP d'INF110
description: Espace de travail pour les TP d'INF110
icon: ../../../site/static/emojis/1f4d0.png
maintainer_github: coder
verified: true
tags: []
---
# Workspace for the INF110 labs
This template is used to create the workspaces for the INF110 labs.
A workspace has already been created for you, and the lab files have been copied into it. Do not try to create a new workspace, as the lab files would not be present in it.
main.tf 0 → 100644
terraform {
required_providers {
coder = {
source = "coder/coder"
}
kubernetes = {
source = "hashicorp/kubernetes"
}
}
}
provider "coder" {
}
variable "use_kubeconfig" {
type = bool
description = <<-EOF
Use host kubeconfig? (true/false)
Set this to false if the Coder host is itself running as a Pod on the same
Kubernetes cluster as you are deploying workspaces to.
Set this to true if the Coder host is running outside the Kubernetes cluster
for workspaces. A valid "~/.kube/config" must be present on the Coder host.
EOF
default = false
}
variable "namespace" {
type = string
description = "The Kubernetes namespace to create workspaces in (must exist prior to creating workspaces). If the Coder host is itself running as a Pod on the same Kubernetes cluster as you are deploying workspaces to, set this to the same namespace."
default = "coder"
}
variable "cpu" {
type = string
description = "The number of CPU cores"
default = "2"
}
variable "memory" {
type = string
description = "The amount of memory in GB"
default = "2"
}
variable "disk_size" {
type = string
description = "The size of the persistent disk in GB"
default = "5"
}
provider "kubernetes" {
# Authenticate via ~/.kube/config or a Coder-specific ServiceAccount, depending on admin preferences
config_path = var.use_kubeconfig == true ? "~/.kube/config" : null
}
data "coder_workspace" "me" {}
data "coder_workspace_owner" "me" {}
resource "coder_agent" "inf110" {
os = "linux"
arch = "amd64"
startup_script_behavior = "blocking"
startup_script = <<-EOT
set -e
# run code-server
code-server --auth none --port 13337 >/tmp/code-server.log 2>&1 &
EOT
# The following metadata blocks are optional. They are used to display
# information about your workspace in the dashboard. You can remove them
# if you don't want to display any information.
# For basic resources, you can use the `coder stat` command.
# If you need more control, you can write your own script.
metadata {
display_name = "CPU Usage"
key = "0_cpu_usage"
script = "coder stat cpu"
interval = 10
timeout = 1
}
metadata {
display_name = "RAM Usage"
key = "1_ram_usage"
script = "coder stat mem"
interval = 10
timeout = 1
}
metadata {
display_name = "Disk Usage"
key = "2_disk_usage"
script = "coder stat disk --path $${HOME}/tp"
interval = 60
timeout = 1
}
metadata {
display_name = "CPU Usage (Host)"
key = "4_cpu_usage_host"
script = "coder stat cpu --host"
interval = 10
timeout = 1
}
metadata {
display_name = "Memory Usage (Host)"
key = "5_mem_usage_host"
script = "coder stat mem --host"
interval = 10
timeout = 1
}
metadata {
display_name = "Load Average (Host)"
key = "6_load_host"
# get load avg scaled by number of cores
script = <<EOT
echo "`cat /proc/loadavg | awk '{ print $1 }'` `nproc`" | awk '{ printf "%0.2f", $1/$2 }'
EOT
interval = 60
timeout = 1
}
display_apps {
# We disable access to all apps other than code-server
vscode = false
vscode_insiders = false
web_terminal = false
ssh_helper = false
port_forwarding_helper = false
}
}
# code-server
resource "coder_app" "code-server" {
agent_id = coder_agent.inf110.id
slug = "code-server"
display_name = "Cliquez ici pour démarrer VS Code"
icon = "/icon/code.svg"
url = "http://localhost:13337?folder=/home/coder/tp"
subdomain = false
share = "owner"
healthcheck {
url = "http://localhost:13337/healthz"
interval = 3
threshold = 10
}
}
resource "kubernetes_persistent_volume_claim" "tp_inf110_folder" {
metadata {
name = "coder-pvc-${lower(data.coder_workspace_owner.me.name)}-tp-inf110-${lower(data.coder_workspace.me.name)}"
namespace = var.namespace
labels = {
"app.kubernetes.io/name" = "coder-pvc"
"app.kubernetes.io/instance" = "coder-pvc-${lower(data.coder_workspace_owner.me.name)}-tp-inf110-${lower(data.coder_workspace.me.name)}"
"app.kubernetes.io/part-of" = "coder"
//Coder-specific labels.
"com.coder.resource" = "true"
"com.coder.workspace.id" = data.coder_workspace.me.id
"com.coder.workspace.name" = data.coder_workspace.me.name
"com.coder.user.id" = data.coder_workspace_owner.me.id
"com.coder.user.username" = data.coder_workspace_owner.me.name
}
annotations = {
"com.coder.user.email" = data.coder_workspace_owner.me.email
}
}
wait_until_bound = false
spec {
access_modes = ["ReadWriteOnce"]
resources {
requests = {
storage = "${var.disk_size}Gi"
}
}
# This storage class was created during the Kubernetes cluster setup
# with the `postgresql-pvc.yml` file.
storage_class_name = "csi-sc-cinderplugin-lvm-nvme"
}
}
resource "kubernetes_deployment" "main" {
count = data.coder_workspace.me.start_count
depends_on = [
kubernetes_persistent_volume_claim.tp_inf110_folder
]
wait_for_rollout = false
metadata {
name = "coder-${lower(data.coder_workspace_owner.me.name)}-tp-inf110-${lower(data.coder_workspace.me.name)}"
namespace = var.namespace
labels = {
"app.kubernetes.io/name" = "coder-workspace"
"app.kubernetes.io/instance" = "coder-workspace-${lower(data.coder_workspace_owner.me.name)}-tp-inf110-${lower(data.coder_workspace.me.name)}"
"app.kubernetes.io/part-of" = "coder"
"com.coder.resource" = "true"
"com.coder.workspace.id" = data.coder_workspace.me.id
"com.coder.workspace.name" = data.coder_workspace.me.name
"com.coder.user.id" = data.coder_workspace_owner.me.id
"com.coder.user.username" = data.coder_workspace_owner.me.name
}
annotations = {
"com.coder.user.email" = data.coder_workspace_owner.me.email
}
}
spec {
replicas = 1
selector {
match_labels = {
"app.kubernetes.io/name" = "coder-workspace"
}
}
strategy {
type = "Recreate"
}
template {
metadata {
labels = {
"app.kubernetes.io/name" = "coder-workspace"
}
}
spec {
security_context {
run_as_user = 1000
fs_group = 1000
}
container {
name = "dev"
# We pull the image from a local Docker registry inside the Kubernetes cluster network
image = "10.0.0.99:5000/inf110-workspace:latest"
image_pull_policy = "Always"
command = ["sh", "-c", coder_agent.inf110.init_script]
security_context {
run_as_user = "1000"
}
env {
name = "CODER_AGENT_TOKEN"
value = coder_agent.inf110.token
}
resources {
# We make sure to request enough resources to avoid overloading nodes
# when all students are working in their workspaces at the same time
requests = {
"cpu" = "1"
"memory" = "${var.memory}Gi"
}
limits = {
"cpu" = "${var.cpu}"
"memory" = "${var.memory}Gi"
}
}
volume_mount {
# We mount the persistent volume claim to the /home/coder/tp directory
# and not to the /home/coder directory, because the latter contains
# the pre-installed code-server configuration and extensions.
mount_path = "/home/coder/tp"
name = "tp"
read_only = false
}
}
volume {
name = "tp"
persistent_volume_claim {
claim_name = kubernetes_persistent_volume_claim.tp_inf110_folder.metadata.0.name
read_only = false
}
}
}
}
}
}
postgresql-pvc.yml 0 → 100644
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: csi-sc-cinderplugin-lvm-nvme
parameters:
type: LVM-NVME
provisioner: cinder.csi.openstack.org
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: coder-db
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
storageClassName: csi-sc-cinderplugin-lvm-nvme
postgresql-values.yml 0 → 100644
# define default database user, name, and password for PostgreSQL deployment
auth:
enablePostgresUser: true
username: "coder"
password: "coder"
database: "coder"
# The PostgreSQL Helm chart deployment will use the pre-created coder-db PVC
primary:
persistence:
enabled: true
existingClaim: "coder-db"
user-settings.json 0 → 100644
{
"workbench.startupEditor": "readme"
}