生成创建容器所需配置

基于kubernetes v1.18.6,关于基于windows平台运行kubelet的相关代码逻辑不作解析。

概述

kubelet通过以下四个步骤,来启动pod容器:

  1. 拉取镜像
  2. 创建容器
  3. 启动容器
  4. 执行容器启动后的钩子

其中创建容器又分为以下子步骤:

  1. 设置容器重启次数
  2. 生成创建容器所需配置
  3. 创建容器
  4. 预启动容器
  5. 生成容器引用信息

本文主要解析创建容器/生成创建容器所需配置阶段kubelet所做工作,首先我们先看下生成创建容器所需配置阶段的代码逻辑

func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, spec *startSpec, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, podIPs []string) (string, error) {
...
    // 生成容器配置-获取临时容器ID
    target, err := spec.getTargetID(podStatus)
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
        return s.Message(), ErrCreateContainerConfig
    }

    containerConfig, cleanupAction, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef, podIPs, target)
    if cleanupAction != nil {
        defer cleanupAction()
    }
    if err != nil {
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
        return s.Message(), ErrCreateContainerConfig
    }
...
}

其核心调用为m.generateContainerConfig(),接下来我们对其深入分析:

配置生成逻辑解析

m.generateContainerConfig()主要逻辑如下

  1. 生成创建容器所需配置
  2. 根据镜像名称,调用容器运行时,获取运行容器启动命令的用户
  3. 检测运行容器启动命令的用户判是否违反pod安全上下文设置(runAsNonRoot: true时,不允许容器以root用户启动)
  4. 生成日志目录(格式为: /var/log/pods/<pod namespace>_<pod name>_<pod uid>/<容器名称>
  5. 针对windows平台,定义额外配置
  6. 定义容器内的环境变量
  7. 组装配置项并返回

源码实现:

kubernetes\pkg\kubelet\kuberuntime\kuberuntime_container.go

// generateContainerConfig generates container config for kubelet runtime v1.
func (m *kubeGenericRuntimeManager) generateContainerConfig(container *v1.Container, pod *v1.Pod, restartCount int, podIP, imageRef string, podIPs []string, nsTarget *kubecontainer.ContainerID) (*runtimeapi.ContainerConfig, func(), error) {
    // 生成创建容器所需配置:环境变量列表、挂载点信息列表、映射到容器中的主机设备列表、容器端口映射列表、容器注解列表、容器根文件系统是否只读、主机名、
    //
    opts, cleanupAction, err := m.runtimeHelper.GenerateRunContainerOptions(pod, container, podIP, podIPs)
    if err != nil {
        return nil, nil, err
    }

    // 根据镜像名称,调用容器运行时,获取运行容器启动命令的用户
    uid, username, err := m.getImageUser(container.Image)
    if err != nil {
        return nil, cleanupAction, err
    }

    // Verify RunAsNonRoot. Non-root verification only supports numeric user.
    // 检测运行容器启动命令的用户判是否违反pod安全上下文设置(runAsNonRoot: true时,不允许容器以root用户启动)
    if err := verifyRunAsNonRoot(pod, container, uid, username); err != nil {
        return nil, cleanupAction, err
    }

    // 解析容器的启动命令与参数
    command, args := kubecontainer.ExpandContainerCommandAndArgs(container, opts.Envs)

    // 生成日志目录(格式为: /var/log/pods/<pod namespace>_<pod name>_<pod uid>/<容器名称>)
    logDir := BuildContainerLogsDirectory(pod.Namespace, pod.Name, pod.UID, container.Name)
    err = m.osInterface.MkdirAll(logDir, 0755)
    if err != nil {
        return nil, cleanupAction, fmt.Errorf("create container log directory for container %s failed: %v", container.Name, err)
    }

    // 定义pod下容器日志路径:<容器名称>/<容器重启次数>.log
    containerLogsPath := buildContainerLogsPath(container.Name, restartCount)

    restartCountUint32 := uint32(restartCount)
    // 组装容器配置
    config := &runtimeapi.ContainerConfig{
        Metadata: &runtimeapi.ContainerMetadata{
            Name:    container.Name,
            Attempt: restartCountUint32,
        },
        Image:       &runtimeapi.ImageSpec{Image: imageRef},
        Command:     command,
        Args:        args,
        WorkingDir:  container.WorkingDir,
        Labels:      newContainerLabels(container, pod),
        Annotations: newContainerAnnotations(container, pod, restartCount, opts),
        Devices:     makeDevices(opts),
        Mounts:      m.makeMounts(opts, container),
        LogPath:     containerLogsPath,
        Stdin:       container.Stdin,
        StdinOnce:   container.StdinOnce,
        Tty:         container.TTY,
    }

    // set platform specific configurations.
    // 针对windows,定义额外配置
    if err := m.applyPlatformSpecificContainerConfig(config, container, pod, uid, username, nsTarget); err != nil {
        return nil, cleanupAction, err
    }

    // set environment variables
    // 定义容器内的环境变量
    envs := make([]*runtimeapi.KeyValue, len(opts.Envs))
    for idx := range opts.Envs {
        e := opts.Envs[idx]
        envs[idx] = &runtimeapi.KeyValue{
            Key:   e.Name,
            Value: e.Value,
        }
    }
    config.Envs = envs

    return config, cleanupAction, nil
}

接下来我们分析下generateContainerConfig()的返回值

m.generateContainerConfig()返回值解析

  1. 返回值一containerConfig: ContainerConfig对象,容器配置属性。

用以创建容器所需信息,该对象属性根据pod清单文件生成。数据结构如下:

  • 容器原生配置
    • Metadata: 主要定义容器名称
    • Image: 运行该容器的镜像
    • Command: 容器执行的命令
    • Args: 容器执行的命令的参数
    • WorkingDir: 容器工作目录(容器运行后执行命令的上下文目录)
    • Envs: 容器的环境变量列表
    • Mounts: 容器的挂载点集合
    • Devices: 映射到容器中的主机设备列表
    • Labels: 容器标签列表(k8s会注入额外的标签,如io.kubernetes.pod.nameio.kubernetes.pod.uid等)
    • Annotations: 容器注解列表(k8s会注入额外的注解,如io.kubernetes.container.hashio.kubernetes.container.restartCount等)
  • k8s下容器额外配置
    • Stdin: 标准输入
    • StdinOnce
    • Tty
    • Linux: 包含容器配额(LinuxContainerResources)及安全配置(LinuxContainerSecurityContext)
// ContainerConfig holds all the required and optional fields for creating a
// container.
type ContainerConfig struct {
    // Metadata of the container. This information will uniquely identify the
    // container, and the runtime should leverage this to ensure correct
    // operation. The runtime may also use this information to improve UX, such
    // as by constructing a readable name.
    Metadata *ContainerMetadata `protobuf:"bytes,1,opt,name=metadata,proto3" json:"metadata,omitempty"`
    // Image to use.
    Image *ImageSpec `protobuf:"bytes,2,opt,name=image,proto3" json:"image,omitempty"`
    // Command to execute (i.e., entrypoint for docker)
    Command []string `protobuf:"bytes,3,rep,name=command,proto3" json:"command,omitempty"`
    // Args for the Command (i.e., command for docker)
    Args []string `protobuf:"bytes,4,rep,name=args,proto3" json:"args,omitempty"`
    // Current working directory of the command.
    WorkingDir string `protobuf:"bytes,5,opt,name=working_dir,json=workingDir,proto3" json:"working_dir,omitempty"`
    // List of environment variable to set in the container.
    Envs []*KeyValue `protobuf:"bytes,6,rep,name=envs,proto3" json:"envs,omitempty"`
    // Mounts for the container.
    Mounts []*Mount `protobuf:"bytes,7,rep,name=mounts,proto3" json:"mounts,omitempty"`
    // Devices for the container.
    Devices []*Device `protobuf:"bytes,8,rep,name=devices,proto3" json:"devices,omitempty"`
    // Key-value pairs that may be used to scope and select individual resources.
    // Label keys are of the form:
    //     label-key ::= prefixed-name | name
    //     prefixed-name ::= prefix '/' name
    //     prefix ::= DNS_SUBDOMAIN
    //     name ::= DNS_LABEL
    Labels map[string]string `protobuf:"bytes,9,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
    // Unstructured key-value map that may be used by the kubelet to store and
    // retrieve arbitrary metadata.
    //
    // Annotations MUST NOT be altered by the runtime; the annotations stored
    // here MUST be returned in the ContainerStatus associated with the container
    // this ContainerConfig creates.
    //
    // In general, in order to preserve a well-defined interface between the
    // kubelet and the container runtime, annotations SHOULD NOT influence
    // runtime behaviour.
    Annotations map[string]string `protobuf:"bytes,10,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
    // Path relative to PodSandboxConfig.LogDirectory for container to store
    // the log (STDOUT and STDERR) on the host.
    // E.g.,
    //     PodSandboxConfig.LogDirectory = `/var/log/pods/<podUID>/`
    //     ContainerConfig.LogPath = `containerName/Instance#.log`
    //
    // WARNING: Log management and how kubelet should interface with the
    // container logs are under active discussion in
    // https://issues.k8s.io/24677. There *may* be future change of direction
    // for logging as the discussion carries on.
    LogPath string `protobuf:"bytes,11,opt,name=log_path,json=logPath,proto3" json:"log_path,omitempty"`
    // Variables for interactive containers, these have very specialized
    // use-cases (e.g. debugging).
    // TODO: Determine if we need to continue supporting these fields that are
    // part of Kubernetes's Container Spec.
    Stdin     bool `protobuf:"varint,12,opt,name=stdin,proto3" json:"stdin,omitempty"`
    StdinOnce bool `protobuf:"varint,13,opt,name=stdin_once,json=stdinOnce,proto3" json:"stdin_once,omitempty"`
    Tty       bool `protobuf:"varint,14,opt,name=tty,proto3" json:"tty,omitempty"`
    // Configuration specific to Linux containers.
    Linux *LinuxContainerConfig `protobuf:"bytes,15,opt,name=linux,proto3" json:"linux,omitempty"`
    // Configuration specific to Windows containers.
    Windows              *WindowsContainerConfig `protobuf:"bytes,16,opt,name=windows,proto3" json:"windows,omitempty"`
    XXX_NoUnkeyedLiteral struct{}                `json:"-"`
    XXX_sizecache        int32                   `json:"-"`
}

容器资源配额对象LinuxContainerResources

type LinuxContainerResources struct {
    // CPU CFS (Completely Fair Scheduler) period. Default: 0 (not specified).
    CpuPeriod int64 `protobuf:"varint,1,opt,name=cpu_period,json=cpuPeriod,proto3" json:"cpu_period,omitempty"`
    // CPU CFS (Completely Fair Scheduler) quota. Default: 0 (not specified).
    CpuQuota int64 `protobuf:"varint,2,opt,name=cpu_quota,json=cpuQuota,proto3" json:"cpu_quota,omitempty"`
    // CPU shares (relative weight vs. other containers). Default: 0 (not specified).
    CpuShares int64 `protobuf:"varint,3,opt,name=cpu_shares,json=cpuShares,proto3" json:"cpu_shares,omitempty"`
    // Memory limit in bytes. Default: 0 (not specified).
    MemoryLimitInBytes int64 `protobuf:"varint,4,opt,name=memory_limit_in_bytes,json=memoryLimitInBytes,proto3" json:"memory_limit_in_bytes,omitempty"`
    // OOMScoreAdj adjusts the oom-killer score. Default: 0 (not specified).
    OomScoreAdj int64 `protobuf:"varint,5,opt,name=oom_score_adj,json=oomScoreAdj,proto3" json:"oom_score_adj,omitempty"`
    // CpusetCpus constrains the allowed set of logical CPUs. Default: "" (not specified).
    CpusetCpus string `protobuf:"bytes,6,opt,name=cpuset_cpus,json=cpusetCpus,proto3" json:"cpuset_cpus,omitempty"`
    // CpusetMems constrains the allowed set of memory nodes. Default: "" (not specified).
    CpusetMems string `protobuf:"bytes,7,opt,name=cpuset_mems,json=cpusetMems,proto3" json:"cpuset_mems,omitempty"`
    // List of HugepageLimits to limit the HugeTLB usage of container per page size. Default: nil (not specified).
    HugepageLimits       []*HugepageLimit `protobuf:"bytes,8,rep,name=hugepage_limits,json=hugepageLimits,proto3" json:"hugepage_limits,omitempty"`
    XXX_NoUnkeyedLiteral struct{}         `json:"-"`
    XXX_sizecache        int32            `json:"-"`
}

容器安全配置对象LinuxContainerSecurityContext

// LinuxContainerSecurityContext holds linux security configuration that will be applied to a container.
type LinuxContainerSecurityContext struct {
    // Capabilities to add or drop.
    Capabilities *Capability `protobuf:"bytes,1,opt,name=capabilities,proto3" json:"capabilities,omitempty"`
    // 特权模式下,可以做以下事情:
    // 1. 所有的linux能力将被添加.
    // 2. 敏感路径(例如sysfs中的内核模块路径)不会被屏蔽。
    // 3. 任何sysfs和procfs都以读写权限挂载。
    // 4. Apparmor将不会被配置
    // 5. Seccomp将不会被配置
    // 6. 设备cgroup不限制对任何设备的访问
    // 7. 主机/dev中的所有设备都可以在容器中使用。
    // 8. SELinux将不会被配置
    Privileged bool `protobuf:"varint,2,opt,name=privileged,proto3" json:"privileged,omitempty"`
    // Configurations for the container's namespaces.
    // Only used if the container uses namespace for isolation.
    NamespaceOptions *NamespaceOption `protobuf:"bytes,3,opt,name=namespace_options,json=namespaceOptions,proto3" json:"namespace_options,omitempty"`
    // SELinux context to be optionally applied.
    SelinuxOptions *SELinuxOption `protobuf:"bytes,4,opt,name=selinux_options,json=selinuxOptions,proto3" json:"selinux_options,omitempty"`
    // 以那个用户运行容器(用户id)
    RunAsUser *Int64Value `protobuf:"bytes,5,opt,name=run_as_user,json=runAsUser,proto3" json:"run_as_user,omitempty"`
    // GID to run the container process as. run_as_group should only be specified
    // when run_as_user or run_as_username is specified; otherwise, the runtime
    // MUST error.
    RunAsGroup *Int64Value `protobuf:"bytes,12,opt,name=run_as_group,json=runAsGroup,proto3" json:"run_as_group,omitempty"`
    // 以那个用户运行容器(用户名称,该用户必需存在,不会自动创建)
    RunAsUsername string `protobuf:"bytes,6,opt,name=run_as_username,json=runAsUsername,proto3" json:"run_as_username,omitempty"`
    // 根文件系统是否只读
    ReadonlyRootfs bool `protobuf:"varint,7,opt,name=readonly_rootfs,json=readonlyRootfs,proto3" json:"readonly_rootfs,omitempty"`
    // List of groups applied to the first process run in the container, in
    // addition to the container's primary GID.
    SupplementalGroups []int64 `protobuf:"varint,8,rep,packed,name=supplemental_groups,json=supplementalGroups,proto3" json:"supplemental_groups,omitempty"`
    // AppArmor profile for the container, candidate values are:
    // * runtime/default: equivalent to not specifying a profile.
    // * unconfined: no profiles are loaded
    // * localhost/<profile_name>: profile loaded on the node
    //    (localhost) by name. The possible profile names are detailed at
    //    http://wiki.apparmor.net/index.php/AppArmor_Core_Policy_Reference
    ApparmorProfile string `protobuf:"bytes,9,opt,name=apparmor_profile,json=apparmorProfile,proto3" json:"apparmor_profile,omitempty"`
    // Seccomp profile for the container, candidate values are:
    // * runtime/default: the default profile for the container runtime
    // * unconfined: unconfined profile, ie, no seccomp sandboxing
    // * localhost/<full-path-to-profile>: the profile installed on the node.
    //   <full-path-to-profile> is the full path of the profile.
    // Default: "", which is identical with unconfined.
    SeccompProfilePath string `protobuf:"bytes,10,opt,name=seccomp_profile_path,json=seccompProfilePath,proto3" json:"seccomp_profile_path,omitempty"`
    // no_new_privs defines if the flag for no_new_privs should be set on the
    // container.
    NoNewPrivs bool `protobuf:"varint,11,opt,name=no_new_privs,json=noNewPrivs,proto3" json:"no_new_privs,omitempty"`
    // 需要隐藏的路径
    MaskedPaths []string `protobuf:"bytes,13,rep,name=masked_paths,json=maskedPaths,proto3" json:"masked_paths,omitempty"`
    // readonly_paths is a slice of paths that should be set as readonly by the
    // container runtime, this can be passed directly to the OCI spec.
    ReadonlyPaths        []string `protobuf:"bytes,14,rep,name=readonly_paths,json=readonlyPaths,proto3" json:"readonly_paths,omitempty"`
    XXX_NoUnkeyedLiteral struct{} `json:"-"`
    XXX_sizecache        int32    `json:"-"`
}
  1. 返回值二cleanupAction: 容器带有子路径的卷成功运行或启动失败后的回调函数

  2. 返回值三为异常(error)

GenerateRunContainerOptions()函数解析

函数逻辑如下:

生成RunContainerOptions对象,赋值以下字段:

  • Devices: 设备列表
  • Annotations: 注释列表
  • PortMappings: 端口映射列表
  • Envs: 环境变量列表(从ConfigMapSecret中获取)
  • Hostname: 主机名称(默认pod名称,可通过.spec.hostname设置,格式必须为: 由小写字母数字字符或'-'组成,并且必须以字母数字字符开始和结束,并且小于等于63个字符。如果大于63个字符会自动截取)
  • EnableHostUserNamespace: 是否使用host命名空间
  • PodContainerDir:

源码实现

kubernetes\pkg\kubelet\kubelet_pods.go

// GenerateRunContainerOptions generates the RunContainerOptions, which can be used by
// the container runtime to set parameters for launching a container.
func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) (*kubecontainer.RunContainerOptions, func(), error) {
    opts, err := kl.containerManager.GetResources(pod, container)
    if err != nil {
        return nil, nil, err
    }

    // 定义pod的hostname与hostDomainName
    hostname, hostDomainName, err := kl.GeneratePodHostNameAndDomain(pod)
    if err != nil {
        return nil, nil, err
    }
    opts.Hostname = hostname
    podName := volumeutil.GetUniquePodName(pod)
    volumes := kl.volumeManager.GetMountedVolumesForPod(podName)

    opts.PortMappings = kubecontainer.MakePortMappings(container)

    blkutil := volumepathhandler.NewBlockVolumePathHandler()
    blkVolumes, err := kl.makeBlockVolumes(pod, container, volumes, blkutil)
    if err != nil {
        return nil, nil, err
    }
    opts.Devices = append(opts.Devices, blkVolumes...)

    envs, err := kl.makeEnvironmentVariables(pod, container, podIP, podIPs)
    if err != nil {
        return nil, nil, err
    }
    opts.Envs = append(opts.Envs, envs...)

    // only podIPs is sent to makeMounts, as podIPs is populated even if dual-stack feature flag is not enabled.
    mounts, cleanupAction, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIPs, volumes, kl.hostutil, kl.subpather, opts.Envs)
    if err != nil {
        return nil, cleanupAction, err
    }
    opts.Mounts = append(opts.Mounts, mounts...)

    // adding TerminationMessagePath on Windows is only allowed if ContainerD is used. Individual files cannot
    // be mounted as volumes using Docker for Windows.
    supportsSingleFileMapping := kl.containerRuntime.SupportsSingleFileMapping()
    if len(container.TerminationMessagePath) != 0 && supportsSingleFileMapping {
        p := kl.getPodContainerDir(pod.UID, container.Name)
        if err := os.MkdirAll(p, 0750); err != nil {
            klog.Errorf("Error on creating %q: %v", p, err)
        } else {
            opts.PodContainerDir = p
        }
    }

    // only do this check if the experimental behavior is enabled, otherwise allow it to default to false
    if kl.experimentalHostUserNamespaceDefaulting {
        opts.EnableHostUserNamespace = kl.enableHostUserNamespace(pod)
    }

    return opts, cleanupAction, nil
}

GenerateRunContainerOptions函数涉及几个比较重要的调用,我们逐一解析:

  • makeBlockVolumes(): 解析pod内容器定义的原生块设备
  • makeMounts(): 解析pod内容器定义的卷挂载

makeBlockVolumes()函数解析

首先我们先介绍下块设备与kubernetes下原生的块设备使用

什么是块设备?

块设备允许对固定大小的块中的数据进行随机访问。硬盘驱动器、SSDCD-ROM驱动器都是块设备的例子。

通常,持久性存储是在通过在块设备(例如磁盘或SSD)之上构造文件系统(例如ext4)的分层方式实现的。 这样应用程序就可以读写文件而不是直接操作数据块。操作系统负责使用指定的文件系统将文件读写转换为对底层设备的数据块读写。

值得注意的是,整个磁盘都是块设备,磁盘分区也是如此,存储区域网络(SAN)设备中的LUN也是一样的。

什么场景需要使用原生的块设备呢?

有些特殊的应用程序需要直接访问块设备,原因例如,文件系统层会引入不必要的开销。最常见的情况是数据库,通常会直接在底层存储上组织数据。 原生的块设备(Raw Block Devices)还通常由能自己实现某种存储服务的软件(软件定义的存储系统)使用。

从程序员的角度来看,块设备是一个非常大的字节数组,具有某种最小读写粒度,通常为512个字节,大部分情况为4K或更大。

随着在Kubernetes中运行数据库软件和存储基础架构软件变得越来越普遍,在Kubernetes中支持原生块设备的需求变得越来越重要。

首先我们先通过以下例子,了解原生块设备使用方式。(CSI插件需要支持块设备创建)

原生块设备使用样例

  1. 创建pvc,其中kubernetes-csi-rbd-scceph rbd类型
$ cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: my-pvc
spec:
  accessModes:
    - ReadWriteMany
  volumeMode: Block
  storageClassName: kubernetes-csi-rbd-sc
  resources:
    requests:
      storage: 1Gi
EOF
  1. pod定义使用my-pvc
$ cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: my-pod
spec:
  containers:
    - name: my-container
      image: busybox
      command:
        - sleep
        - "3600"
      volumeDevices:
        - devicePath: /dev/block
          name: my-volume
      imagePullPolicy: IfNotPresent
  volumes:
    - name: my-volume
      persistentVolumeClaim:
        claimName: my-pvc
EOF
  1. 进入pod内查看块设备
$ kubectl exec -it my-pod -- sh
/ # ls -l /dev/block
brwxrwxrwx    1 root     disk      252, 352 Nov 22 05:48 /dev/block

我们发现容器内部确实为块设备类型,这里介绍原生块设备是为了下面对其讨论。

接下来我们分析下,makeBlockVolumes()函数具体做了哪些操作:

makeBlockVolumes()函数源码

kubernetes\pkg\kubelet\kubelet_pods.go

func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) {
    var devices []kubecontainer.DeviceInfo
    for _, device := range container.VolumeDevices {
        // check path is absolute
        if !filepath.IsAbs(device.DevicePath) {
            return nil, fmt.Errorf("error DevicePath `%s` must be an absolute path", device.DevicePath)
        }
        vol, ok := podVolumes[device.Name]
        if !ok || vol.BlockVolumeMapper == nil {
            klog.Errorf("Block volume cannot be satisfied for container %q, because the volume is missing or the volume mapper is nil: %+v", container.Name, device)
            return nil, fmt.Errorf("cannot find volume %q to pass into container %q", device.Name, container.Name)
        }
        // Get a symbolic link associated to a block device under pod device path
        dirPath, volName := vol.BlockVolumeMapper.GetPodDeviceMapPath()
        symlinkPath := path.Join(dirPath, volName)
        if islinkExist, checkErr := blkutil.IsSymlinkExist(symlinkPath); checkErr != nil {
            return nil, checkErr
        } else if islinkExist {
            // Check readOnly in PVCVolumeSource and set read only permission if it's true.
            permission := "mrw"
            if vol.ReadOnly {
                permission = "r"
            }
            klog.V(4).Infof("Device will be attached to container %q. Path on host: %v", container.Name, symlinkPath)
            devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: symlinkPath, PathInContainer: device.DevicePath, Permissions: permission})
        }
    }

    return devices, nil
}

makeBlockVolumes()函数主要遍历容器内声明的原生块设备列表,并执行以下操作:

  1. 判断声明的块设备挂载路径(devicePath)是否为绝对路径(如: /dev/block),如果非绝对路径(block)返回异常。
  2. 判断卷组(spec.volumes)内是否含有该设备的pvc(卷声明)
  3. 映射主机上块设备路径与容器内路径:

块设备链接如下:

$ ls -l /var/lib/kubelet/pods/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb/volumeDevices/kubernetes.io~csi/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1
lrwxrwxrwx 1 root root 142 Nov 22 13:48 /var/lib/kubelet/pods/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb/volumeDevices/kubernetes.io~csi/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1 -> /var/lib/kubelet/plugins/kubernetes.io/csi/volumeDevices/publish/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb
$ ls -l /var/lib/kubelet/plugins/kubernetes.io/csi/volumeDevices/publish/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb
brwxrwxrwx 1 root disk 252, 352 Nov 22 13:48 /var/lib/kubelet/plugins/kubernetes.io/csi/volumeDevices/publish/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb

格式为:

src: /{kubelet data dir}/pods/{podUid}/{DefaultKubeletVolumeDevicesDirName}/{escapeQualifiedPluginName}/, {volumeName}
dst: /var/lib/kubelet/plugins/kubernetes.io/{PluginName}/{DefaultKubeletVolumeDevicesDirName}/{volumePluginDependentPath}/{pod uuid}

我们可以在宿主机对该块设备操作:

$ fdisk  /var/lib/kubelet/plugins/kubernetes.io/csi/volumeDevices/publish/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb
Welcome to fdisk (util-linux 2.23.2).

Changes will remain in memory only, until you decide to write them.
Be careful before using the write command.


Command (m for help): n
Partition type:
   p   primary (0 primary, 0 extended, 4 free)
   e   extended
Select (default p): p
Partition number (1-4, default 1): 1
First sector (2048-2097151, default 2048):
Using default value 2048
Last sector, +sectors or +size{K,M,G} (2048-2097151, default 2097151):
Using default value 2097151
Partition 1 of type Linux and of size 1023 MiB is set

Command (m for help): w
The partition table has been altered!

Calling ioctl() to re-read partition table.
Syncing disks.

注意:这里只初始化RunContainerOptions.Devices的值(检测容器中.volumeDevices字段的合法性,并赋值给RunContainerOptions.Devices数组),并不会执行具体操作。

思考一个问题:为什么会在pod宿主机上,创建出一个块设备?

由于没有看过CSI的源码,这里推测:这个块设备应该是起到桥梁的作用(桥接CSI Agentpod容器内的原生块设备),本质为链接而非实体,块设备实体由CSI管理。

对应关系可能如下:

  • pod内容器的原生块设备 -> pod宿主机上kubelet数据目录下创建块设备 <- CSI

makeMounts()函数解析

该函数,包含一个cleanupAction返回值,该函数为清理卷的subPath函数,容器启动阶段该值为空。该返回值即为上文m.runtimeHelper.GenerateRunContainerOptions()函数调用的返回值。

主要逻辑为遍历pod下容器的卷,执行以下操作:

  1. 判断pod下卷组是否含有容器所定义的卷,不存在返回异常
  2. 如果卷支持SELinux,并且它还没有被重新标记,而且它不是只读卷,重新标记它并将其标记为已标记卷
  3. 判断卷的挂载路径(volumeMounts.mountPath)是否为空,为空的话返回异常
  4. 解析volumeMounts.subPathvolumeMounts.subPathExpr(同一个卷只能存在其中一个字段,否则异常返回):
  5. 当卷volumeMounts.subPathExpr不为空时,需开启VolumeSubpathVolumeSubpathEnvExpansion特性门控(v1.18.6默认开启),否则返回异常
  6. 当卷volumeMounts.subPath不为空时,需开启VolumeSubpath特性门控(v1.18.6默认开启),否则返回异常
  7. 当卷volumeMounts.subPath不为空时,值不能为绝对路径,否则返回异常
  8. 当卷volumeMounts.subPath不为空时,值不能包含..(如: /opt/../root/1.yaml),否则返回异常
  9. 当卷volumeMounts.subPath不为空时,拼接挂载路径值(如:volumeMounts.mountPath/optvolumeMounts.subPath/opt/1.yaml,最终挂载路径为/opt/1.yaml),容器内不能存在该路径(/opt/1.yaml),否则返回异常
  10. 解析podspec.hostAliases数组,写入容器的/etc/hosts内。如果该Pod使用主机网络命名空间,主机的/etc/hosts内容也将写入容器的/etc/hosts

值得注意的是NSA&CISA发布的Kubernetes加固指南 认为子路径存在安全隐患,不建议使用

源码实现

``

// makeMounts determines the mount points for the given container.
func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain string, podIPs []string, podVolumes kubecontainer.VolumeMap, hu hostutil.HostUtils, subpather subpath.Interface, expandEnvs []kubecontainer.EnvVar) ([]kubecontainer.Mount, func(), error) {
    // Kubernetes only mounts on /etc/hosts if:
    // - container is not an infrastructure (pause) container
    // - container is not already mounting on /etc/hosts
    // - OS is not Windows
    // Kubernetes will not mount /etc/hosts if:
    // - when the Pod sandbox is being created, its IP is still unknown. Hence, PodIP will not have been set.
    mountEtcHostsFile := len(podIPs) > 0 && runtime.GOOS != "windows"
    klog.V(3).Infof("container: %v/%v/%v podIPs: %q creating hosts mount: %v", pod.Namespace, pod.Name, container.Name, podIPs, mountEtcHostsFile)
    mounts := []kubecontainer.Mount{}
    var cleanupAction func()
    for i, mount := range container.VolumeMounts {
        // do not mount /etc/hosts if container is already mounting on the path
        mountEtcHostsFile = mountEtcHostsFile && (mount.MountPath != etcHostsPath)
        vol, ok := podVolumes[mount.Name]
        if !ok || vol.Mounter == nil {
            klog.Errorf("Mount cannot be satisfied for container %q, because the volume is missing (ok=%v) or the volume mounter (vol.Mounter) is nil (vol=%+v): %+v", container.Name, ok, vol, mount)
            return nil, cleanupAction, fmt.Errorf("cannot find volume %q to mount into container %q", mount.Name, container.Name)
        }

        relabelVolume := false
        // If the volume supports SELinux and it has not been
        // relabeled already and it is not a read-only volume,
        // relabel it and mark it as labeled
        if vol.Mounter.GetAttributes().Managed && vol.Mounter.GetAttributes().SupportsSELinux && !vol.SELinuxLabeled {
            vol.SELinuxLabeled = true
            relabelVolume = true
        }
        hostPath, err := volumeutil.GetPath(vol.Mounter)
        if err != nil {
            return nil, cleanupAction, err
        }

        subPath := mount.SubPath
        if mount.SubPathExpr != "" {
            if !utilfeature.DefaultFeatureGate.Enabled(features.VolumeSubpath) {
                return nil, cleanupAction, fmt.Errorf("volume subpaths are disabled")
            }

            if !utilfeature.DefaultFeatureGate.Enabled(features.VolumeSubpathEnvExpansion) {
                return nil, cleanupAction, fmt.Errorf("volume subpath expansion is disabled")
            }

            subPath, err = kubecontainer.ExpandContainerVolumeMounts(mount, expandEnvs)

            if err != nil {
                return nil, cleanupAction, err
            }
        }

        if subPath != "" {
            if !utilfeature.DefaultFeatureGate.Enabled(features.VolumeSubpath) {
                return nil, cleanupAction, fmt.Errorf("volume subpaths are disabled")
            }

            if filepath.IsAbs(subPath) {
                return nil, cleanupAction, fmt.Errorf("error SubPath `%s` must not be an absolute path", subPath)
            }

            err = volumevalidation.ValidatePathNoBacksteps(subPath)
            if err != nil {
                return nil, cleanupAction, fmt.Errorf("unable to provision SubPath `%s`: %v", subPath, err)
            }

            volumePath := hostPath
            hostPath = filepath.Join(volumePath, subPath)

            if subPathExists, err := hu.PathExists(hostPath); err != nil {
                klog.Errorf("Could not determine if subPath %s exists; will not attempt to change its permissions", hostPath)
            } else if !subPathExists {
                // Create the sub path now because if it's auto-created later when referenced, it may have an
                // incorrect ownership and mode. For example, the sub path directory must have at least g+rwx
                // when the pod specifies an fsGroup, and if the directory is not created here, Docker will
                // later auto-create it with the incorrect mode 0750
                // Make extra care not to escape the volume!
                perm, err := hu.GetMode(volumePath)
                if err != nil {
                    return nil, cleanupAction, err
                }
                if err := subpather.SafeMakeDir(subPath, volumePath, perm); err != nil {
                    // Don't pass detailed error back to the user because it could give information about host filesystem
                    klog.Errorf("failed to create subPath directory for volumeMount %q of container %q: %v", mount.Name, container.Name, err)
                    return nil, cleanupAction, fmt.Errorf("failed to create subPath directory for volumeMount %q of container %q", mount.Name, container.Name)
                }
            }
            hostPath, cleanupAction, err = subpather.PrepareSafeSubpath(subpath.Subpath{
                VolumeMountIndex: i,
                Path:             hostPath,
                VolumeName:       vol.InnerVolumeSpecName,
                VolumePath:       volumePath,
                PodDir:           podDir,
                ContainerName:    container.Name,
            })
            if err != nil {
                // Don't pass detailed error back to the user because it could give information about host filesystem
                klog.Errorf("failed to prepare subPath for volumeMount %q of container %q: %v", mount.Name, container.Name, err)
                return nil, cleanupAction, fmt.Errorf("failed to prepare subPath for volumeMount %q of container %q", mount.Name, container.Name)
            }
        }

        // Docker Volume Mounts fail on Windows if it is not of the form C:/
        if volumeutil.IsWindowsLocalPath(runtime.GOOS, hostPath) {
            hostPath = volumeutil.MakeAbsolutePath(runtime.GOOS, hostPath)
        }

        containerPath := mount.MountPath
        // IsAbs returns false for UNC path/SMB shares/named pipes in Windows. So check for those specifically and skip MakeAbsolutePath
        if !volumeutil.IsWindowsUNCPath(runtime.GOOS, containerPath) && !filepath.IsAbs(containerPath) {
            containerPath = volumeutil.MakeAbsolutePath(runtime.GOOS, containerPath)
        }

        propagation, err := translateMountPropagation(mount.MountPropagation)
        if err != nil {
            return nil, cleanupAction, err
        }
        klog.V(5).Infof("Pod %q container %q mount %q has propagation %q", format.Pod(pod), container.Name, mount.Name, propagation)

        mustMountRO := vol.Mounter.GetAttributes().ReadOnly

        mounts = append(mounts, kubecontainer.Mount{
            Name:           mount.Name,
            ContainerPath:  containerPath,
            HostPath:       hostPath,
            ReadOnly:       mount.ReadOnly || mustMountRO,
            SELinuxRelabel: relabelVolume,
            Propagation:    propagation,
        })
    }
    if mountEtcHostsFile {
        hostAliases := pod.Spec.HostAliases
        hostsMount, err := makeHostsMount(podDir, podIPs, hostName, hostDomain, hostAliases, pod.Spec.HostNetwork)
        if err != nil {
            return nil, cleanupAction, err
        }
        mounts = append(mounts, *hostsMount)
    }
    return mounts, cleanupAction, nil
}

总结

该阶段主要做以下操作:

  1. 生成创建容器所需配置
  2. 根据镜像名称,调用容器运行时,获取运行容器启动命令的用户
  3. 检测运行容器启动命令的用户判是否违反pod安全上下文设置(runAsNonRoot: true时,不允许容器以root用户启动)
  4. 生成日志目录(格式为: /var/log/pods/<pod namespace>_<pod name>_<pod uid>/<容器名称>
  5. 针对windows平台,定义额外配置
  6. 定义容器内的环境变量
  7. 组装配置项并返回

参考文章

Raw Block Volume 支持进入 Beta

Copyright © weiliang 2021 all right reserved,powered by Gitbook本书发布时间: 2024-04-22 16:03:41

results matching ""

    No results matching ""