;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2015 David Thompson <davet@gnu.org>
;;; Copyright © 2016, 2017, 2019, 2020, 2021 Ludovic Courtès <ludo@gnu.org>
;;; Copyright © 2019 Arun Isaac <arunisaac@systemreboot.net>
;;; Copyright © 2020 Efraim Flashner <efraim@flashner.co.il>
;;; Copyright © 2020 Google LLC
;;;
;;; This file is part of GNU Guix.
;;;
;;; GNU Guix is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; GNU Guix is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.

(define-module (gnu system linux-container)
  #:use-module (ice-9 match)
  #:use-module (srfi srfi-1)
  #:use-module (guix config)
  #:use-module (guix store)
  #:use-module (guix gexp)
  #:use-module (guix derivations)
  #:use-module (guix monads)
  #:use-module (guix modules)
  #:use-module (gnu build linux-container)
  #:use-module (gnu services)
  #:use-module (gnu services base)
  #:use-module (gnu services networking)
  #:use-module (gnu services shepherd)
  #:use-module (gnu system)
  #:use-module (gnu system file-systems)
  #:export (system-container
            containerized-operating-system
            container-script
            eval/container))

(define* (container-essential-services os #:key shared-network?)
  "Return a list of essential services corresponding to OS, a
non-containerized OS.  This procedure essentially strips essential services
from OS that are needed on the bare metal and not in a container."
  (define base
    (remove (lambda (service)
              (memq (service-kind service)
                    (list (service-kind %linux-bare-metal-service)
                          firmware-service-type
                          system-service-type)))
            (operating-system-default-essential-services os)))

  (cons (service system-service-type
                 `(("locale" ,(operating-system-locale-directory os))))
        ;; If network is to be shared with the host, remove network
        ;; configuration files from etc-service.
        (if shared-network?
            (modify-services base
              (etc-service-type
               files => (remove
                         (match-lambda
                           ((filename _)
                            (member filename
                                    (map basename %network-configuration-files))))
                         files)))
            base)))

(define dummy-networking-service-type
  (shepherd-service-type
   'dummy-networking
   (const (shepherd-service
           (documentation "Provide loopback and networking without actually
doing anything.")
           (provision '(loopback networking))
           (start #~(const #t))))
   #f
   (description "Provide loopback and networking without actually doing
anything.  This service is used by guest systems running in containers, where
networking support is provided by the host.")))

(define %nscd-container-caches
  ;; Similar to %nscd-default-caches but with smaller cache sizes. This allows
  ;; many containers to coexist on the same machine without exhausting RAM.
  (map (lambda (cache)
         (nscd-cache
          (inherit cache)
          (max-database-size (expt 2 18)))) ;256KiB
       %nscd-default-caches))

(define* (containerized-operating-system os mappings
                                         #:key
                                         shared-network?
                                         (extra-file-systems '()))
  "Return an operating system based on OS for use in a Linux container
environment.  MAPPINGS is a list of <file-system-mapping> to realize in the
containerized OS.  EXTRA-FILE-SYSTEMS is a list of file systems to add to OS."
  (define user-file-systems
    (remove (lambda (fs)
              (let ((target (file-system-mount-point fs))
                    (source (file-system-device fs)))
                (or (string=? target (%store-prefix))
                    (string=? target "/")
                    (and (string? source)
                         (string-prefix? "/dev/" source))
                    (string-prefix? "/dev/" target)
                    (string-prefix? "/sys/" target))))
            (operating-system-file-systems os)))

  (define (mapping->fs fs)
    (file-system (inherit (file-system-mapping->bind-mount fs))
      (needed-for-boot? #t)))

  (define services-to-drop
    ;; Service types to filter from the original operating-system. Some of
    ;; these make no sense in a container (e.g., those that access
    ;; /dev/tty[0-9]), while others just need to be reinstantiated with
    ;; different configs that are better suited to containers.
    (append (list console-font-service-type
                  mingetty-service-type
                  agetty-service-type
                  ;; Reinstantiated below with smaller caches.
                  nscd-service-type)
            (if shared-network?
                ;; Replace these with dummy-networking-service-type below.
                (list
                 static-networking-service-type
                 dhcp-client-service-type
                 network-manager-service-type
                 connman-service-type
                 wicd-service-type)
                (list))))

  (define services-to-add
    (append
     ;; Many Guix services depend on a 'networking' shepherd
     ;; service, so make sure to provide a dummy 'networking'
     ;; service when we are sure that networking is already set up
     ;; in the host and can be used.  That prevents double setup.
     (if shared-network?
         (list (service dummy-networking-service-type))
         '())
     (list
      (nscd-service (nscd-configuration
                     (caches %nscd-container-caches))))))

  (operating-system
    (inherit os)
    (swap-devices '()) ; disable swap
    (essential-services (container-essential-services
                         this-operating-system
                         #:shared-network? shared-network?))
    (services (append (remove (lambda (service)
                                (memq (service-kind service)
                                      services-to-drop))
                              (operating-system-user-services os))
                      services-to-add))
    (file-systems (append (map mapping->fs
                               (if shared-network?
                                   (append %network-file-mappings mappings)
                                   mappings))
                          extra-file-systems
                          user-file-systems

                          ;; Provide a dummy root file system so we can create
                          ;; a 'boot-parameters' file.
                          (list (file-system
                                  (mount-point "/")
                                  (device "nothing")
                                  (type "dummy")))))))

(define* (container-script os #:key (mappings '()) shared-network?)
  "Return a derivation of a script that runs OS as a Linux container.
MAPPINGS is a list of <file-system> objects that specify the files/directories
that will be shared with the host system."
  (define (mountable-file-system? file-system)
    ;; Return #t if FILE-SYSTEM should be mounted in the container.
    (and (not (string=? "/" (file-system-mount-point file-system)))
         (file-system-needed-for-boot? file-system)))

  (define (os-file-system-specs os)
    (map file-system->spec
         (filter mountable-file-system?
                 (operating-system-file-systems os))))

  (let* ((os (containerized-operating-system
              os (cons %store-mapping mappings)
              #:shared-network? shared-network?
              #:extra-file-systems %container-file-systems))
         (specs (os-file-system-specs os)))

    (define script
      (with-imported-modules (source-module-closure
                              '((guix build utils)
                                (gnu build linux-container)
                                (guix i18n)
                                (guix diagnostics)))
        #~(begin
            (use-modules (gnu build linux-container)
                         (gnu system file-systems) ;spec->file-system
                         (guix build utils)
                         (guix i18n)
                         (guix diagnostics)
                         (srfi srfi-1))

            (define file-systems
              (filter-map (lambda (spec)
                            (let* ((fs    (spec->file-system spec))
                                   (flags (file-system-flags fs)))
                              (and (or (not (memq 'bind-mount flags))
                                       (file-exists? (file-system-device fs)))
                                   fs)))
                          '#$specs))

            (define (explain pid)
              ;; XXX: We can't quite call 'bindtextdomain' so there's actually
              ;; no i18n.
              ;; XXX: Should we really give both options? 'guix container exec'
              ;; is a more verbose command.  Hard to fail to enter the container
              ;; when we list two options.
              (info (G_ "system container is running as PID ~a~%") pid)
              (info (G_ "Run 'sudo guix container exec ~a /run/current-system/profile/bin/bash --login'\n")
                    pid)
              (info (G_ "or run 'sudo nsenter -a -t ~a' to get a shell into it.~%") pid)
              (newline (guix-warning-port)))

            (call-with-container file-systems
              (lambda ()
                (setenv "HOME" "/root")
                (setenv "TMPDIR" "/tmp")
                (setenv "GUIX_NEW_SYSTEM" #$os)
                (for-each mkdir-p '("/run" "/bin" "/etc" "/home" "/var"))
                (primitive-load (string-append #$os "/boot")))
              ;; A range of 65536 uid/gids is used to cover 16 bits worth of
              ;; users and groups, which is sufficient for most cases.
              ;;
              ;; See: http://www.freedesktop.org/software/systemd/man/systemd-nspawn.html#--private-users=
              #:host-uids 65536
              #:namespaces (if #$shared-network?
                               (delq 'net %namespaces)
                               %namespaces)
              #:process-spawned-hook explain))))

    (gexp->script "run-container" script)))

(define* (eval/container exp
                         #:key
                         (mappings '())
                         (namespaces %namespaces))
  "Evaluate EXP, a gexp, in a new process executing in separate namespaces as
listed in NAMESPACES.  Add MAPPINGS, a list of <file-system-mapping>, to the
set of directories visible in the process's mount namespace.  Return the
process' exit status as a monadic value.

This is useful to implement processes that, unlike derivations, are not
entirely pure and need to access the outside world or to perform side
effects."
  (mlet %store-monad ((lowered (lower-gexp exp)))
    (define inputs
      (cons (lowered-gexp-guile lowered)
            (lowered-gexp-inputs lowered)))

    (define items
      (append (append-map derivation-input-output-paths inputs)
              (lowered-gexp-sources lowered)))

    (mbegin %store-monad
      (built-derivations inputs)
      (mlet %store-monad ((closure ((store-lift requisites) items)))
        (return (call-with-container (map file-system-mapping->bind-mount
                                          (append (map (lambda (item)
                                                         (file-system-mapping
                                                          (source item)
                                                          (target source)))
                                                       closure)
                                                  mappings))
                  (lambda ()
                    (apply execl
                           (string-append (derivation-input-output-path
                                           (lowered-gexp-guile lowered))
                                          "/bin/guile")
                           "guile"
                           (append (append-map (lambda (directory)
                                                 `("-L" ,directory))
                                               (lowered-gexp-load-path lowered))
                                   (append-map (lambda (directory)
                                                 `("-C" ,directory))
                                               (lowered-gexp-load-compiled-path
                                                lowered))
                                   (list "-c"
                                         (object->string
                                          (lowered-gexp-sexp lowered))))))))))))