libvirt cannot find a KVM-capable guest

Creating a virtual machine on a physical host fails with an error saying a virtual machine with domaintype=kvm is not supported (intermittent).

error: Failed to create domain from test.xml
error: invalid argument: could not find capabilities for arch=x86_64 domaintype=kvm

KVM support is enabled in the host BIOS, and the kvm kernel module is confirmed loaded.

Environment

Host OS: CentOS Linux release 7.7.1908 (Core)

Virtualization software versions:

libvirt:5.0.0

qemu:2.12.0

Analysis of virsh capabilities

Run virsh capabilities and inspect the output:

 <guest>
    <os_type>hvm</os_type>
    <arch name='x86_64'>
      <wordsize>64</wordsize>
      <emulator>/usr/libexec/qemu-kvm</emulator>
      <machine maxCpus='240'>pc-i440fx-rhel7.6.0</machine>
      <machine canonical='pc-i440fx-rhel7.6.0' maxCpus='240'>pc</machine>
      ...
      <domain type='qemu'/>
      #<domain type='kvm'/>  <-- this entry is missing
    </arch>

Code-path analysis:

(gdb) bt
#0  virQEMUCapsInit (cache=0x7f8c4c10ad00) at qemu/qemu_capabilities.c:900
#1  0x00007f8c550c49b0 in virQEMUDriverCreateCapabilities (driver=driver@entry=0x7f8c4c11b0a0) at qemu/qemu_conf.c:1098
#2  0x00007f8c550c4c83 in virQEMUDriverGetCapabilities (driver=0x7f8c4c11b0a0, refresh=<optimized out>) at qemu/qemu_conf.c:1168
#3  0x00007f8c55128443 in qemuConnectGetCapabilities (conn=<optimized out>) at qemu/qemu_driver.c:1434
#4  0x00007f8c823ebec3 in virConnectGetCapabilities (conn=0x7f8c3d8efa50) at libvirt-host.c:403
#5  0x000055e7846f0b97 in remoteDispatchConnectGetCapabilities (server=0x55e785f6f0b0, msg=0x55e785fa5600, ret=0x7f8c3d814b70, rerr=0x7f8c71392c10, client=0x55e785fa5130) at remote/remote_daemon_dispatch_stubs.h:790
#6  remoteDispatchConnectGetCapabilitiesHelper (server=0x55e785f6f0b0, client=0x55e785fa5130, msg=0x55e785fa5600, rerr=0x7f8c71392c10, args=0x7f8c3d8aa390, ret=0x7f8c3d814b70) at remote/remote_daemon_dispatch_stubs.h:769
#7  0x00007f8c82308655 in virNetServerProgramDispatchCall (msg=0x55e785fa5600, client=0x55e785fa5130, server=0x55e785f6f0b0, prog=0x55e785f96610) at rpc/virnetserverprogram.c:435
#8  virNetServerProgramDispatch (prog=0x55e785f96610, server=server@entry=0x55e785f6f0b0, client=0x55e785fa5130, msg=0x55e785fa5600) at rpc/virnetserverprogram.c:302
#9  0x00007f8c8230ee8d in virNetServerProcessMsg (msg=<optimized out>, prog=<optimized out>, client=<optimized out>, srv=0x55e785f6f0b0) at rpc/virnetserver.c:142
#10 virNetServerHandleJob (jobOpaque=<optimized out>, opaque=0x55e785f6f0b0) at rpc/virnetserver.c:163
#11 0x00007f8c8223c3e1 in virThreadPoolWorker (opaque=opaque@entry=0x55e785f4e0e0) at util/virthreadpool.c:163
#12 0x00007f8c8223b768 in virThreadHelper (data=<optimized out>) at util/virthread.c:206
#13 0x00007f8c7fa1ee65 in start_thread () from /lib64/libpthread.so.0
#14 0x00007f8c7f34088d in clone () from /lib64/libc.so.6

Here virQEMUCapsInit is called as virQEMUCapsInit(driver->qemuCapsCache), i.e. the cache is taken from the driver.

This function gathers some basic information about the host and then calls virQEMUCapsInitGuest to build the guest configuration; whether a kvm guest domain type is added is decided from qemuCaps, i.e. from the cache.

//virQEMUCapsInitGuest
/* Ignore binary if extracting version info fails */
    if (binary) {
    //look up this binary's capabilities in the cache
        if (!(qemuCaps = virQEMUCapsCacheLookup(cache, binary))) {
            virResetLastError();
            VIR_FREE(binary);
        }
    }
    ret = virQEMUCapsInitGuestFromBinary(caps,
                                         binary, qemuCaps,
                                         guestarch);
...
//virQEMUCapsInitGuestFromBinary
//if QEMU_CAPS_KVM is set, the kvm domain type is added; on the broken host it is not added, which is exactly where the problem lies
if (virQEMUCapsGet(qemuCaps, QEMU_CAPS_KVM)) {
        if (virCapabilitiesAddGuestDomain(guest,
                                          VIR_DOMAIN_VIRT_KVM,
                                          NULL,
                                          NULL,
                                          0,
                                          NULL) == NULL) {
            goto cleanup;
        }
    }

Initialization of the cache

Throughout the query path the cache already exists, so to find where it comes from we have to look at libvirt's initialization:

Breakpoint 1, virQEMUCapsCacheLookup (cache=cache@entry=0x7fffc0136b50, binary=0x7fffc0155b40 "/usr/libexec/qemu-kvm") at qemu/qemu_capabilities.c:4855
4855	{
(gdb) bt
#0  virQEMUCapsCacheLookup (cache=cache@entry=0x7fffc0136b50, binary=0x7fffc0155b40 "/usr/libexec/qemu-kvm") at qemu/qemu_capabilities.c:4855
#1  0x00007fffcabdd864 in virQEMUCapsInitGuest (guestarch=VIR_ARCH_I686, hostarch=VIR_ARCH_X86_64, cache=0x7fffc0136b50, caps=0x7fffc0154760) at qemu/qemu_capabilities.c:781
#2  virQEMUCapsInit (cache=0x7fffc0136b50) at qemu/qemu_capabilities.c:944
#3  0x00007fffcac2f600 in virQEMUDriverCreateCapabilities (driver=driver@entry=0x7fffc0114190) at qemu/qemu_conf.c:1098
#4  0x00007fffcac791ba in qemuStateInitialize (privileged=true, callback=<optimized out>, opaque=<optimized out>) at qemu/qemu_driver.c:927
#5  0x00007ffff7661f1f in virStateInitialize (privileged=true, callback=callback@entry=0x555555577c30 <daemonInhibitCallback>, opaque=opaque@entry=0x5555557f6720) at libvirt.c:657
#6  0x0000555555577c8b in daemonRunStateInit (opaque=0x5555557f6720) at remote/remote_daemon.c:796
#7  0x00007ffff74d5d82 in virThreadHelper (data=<optimized out>) at util/virthread.c:206
#8  0x00007ffff4a8ae65 in start_thread () from /lib64/libpthread.so.0
#9  0x00007ffff43ac88d in clone () from /lib64/libc.so.6

During libvirt initialization, virQEMUCapsCacheLookup is executed, which in turn calls virFileCacheValidate:

static void
virFileCacheValidate(virFileCachePtr cache,
                     const char *name,
                     void **data)
{
    //call the driver's isValid callback
    if (*data && !cache->handlers.isValid(*data, cache->priv)) {
        VIR_DEBUG("Cached data '%p' no longer valid for '%s'",
                  *data, NULLSTR(name));
        if (name)
            virHashRemoveEntry(cache->table, name);
        *data = NULL;
    }
    //if invalid, regenerate the data
    if (!*data && name) {
        VIR_DEBUG("Creating data for '%s'", name);
        *data = virFileCacheNewData(cache, name);
        if (*data) {
            VIR_DEBUG("Caching data '%p' for '%s'", *data, name);
            if (virHashAddEntry(cache->table, name, *data) < 0) {
                virObjectUnref(*data);
                *data = NULL;
            }
        }
    }
}

Generating the cache data

static void *
virFileCacheNewData(virFileCachePtr cache,
                    const char *name)
{
    void *data = NULL;
    int rv;
    //try to load it from the file cache; this also runs the isValid check
    if ((rv = virFileCacheLoad(cache, name, &data)) < 0)
        return NULL;

    if (rv == 0) {
        //call the driver's newData callback to regenerate the data
        if (!(data = cache->handlers.newData(name, cache->priv)))
            return NULL;

        if (virFileCacheSave(cache, name, data) < 0) {
            virObjectUnref(data);
            data = NULL;
        }
    }

    return data;
}

When libvirt initializes, it checks whether /var/cache/libvirt/qemu/capabilities/*.xml exists and is still valid. If so, the capabilities are loaded from the file; otherwise they are regenerated. The regeneration code lives in virQEMUCapsNewData, and virQEMUCapsInitQMP is the key step.

The rough process is as follows:

  1. Start a qemu process

    /usr/libexec/qemu-kvm -S -no-user-config -nodefaults -nographic -machine none,accel=kvm:tcg -qmp unix:/var/lib/libvirt/qemu/capabilities.monitor.sock,server,nowait -pidfile /var/lib/libvirt/qemu/capabilities.pidfile -daemonize
    

    Here accel=kvm:tcg specifies both the kvm and tcg accelerators: if KVM works, kvm is used first; if not, qemu falls back to full emulation with tcg.

  2. Query capabilities with QMP commands

    Quite a few commands are issued, probing many different features. Taking the KVM support check as an example, the QMP command used is:

    {"execute":"query-kvm"}
    

    KVM support is then determined from the return value:

    static int
    virQEMUCapsProbeQMPKVMState(virQEMUCapsPtr qemuCaps,
                                qemuMonitorPtr mon)
    {
        bool enabled = false;
        bool present = false;
       
        if (qemuMonitorGetKVMState(mon, &enabled, &present) < 0)
            return -1;
       
        if (present && enabled)
            virQEMUCapsSet(qemuCaps, QEMU_CAPS_KVM);
       
        return 0;
    }
    
  3. When probing is finished, the process is shut down and its resources are cleaned up.

All probe results are stored in the cache and written to the cache file.

Abstraction of the cache system

The libvirt code caches the capabilities of the underlying hypervisor, and this cache has a complete workflow of its own; the details of each callback can be seen in virfilecache.c.

It roughly covers the following:

  1. Creating the cache

    First check whether the cache file exists; if it exists and is valid (isValid), load it from the file (loadFile). Otherwise regenerate the data (newData) and save it to the file (saveFile) so it can simply be loaded next time.

  2. Querying the cache

    When querying the cache, first check whether the entry is still valid (isValid); if it is not, go through the creation flow again, otherwise return the cached result directly.

Of course, this flow is only used when libvirt drives qemu (the qemu driver is what implements this set of callbacks); the concrete implementations can be found in qemu_capabilities.c:

virFileCacheHandlers qemuCapsCacheHandlers = {
    .isValid = virQEMUCapsIsValid,
    .newData = virQEMUCapsNewData,
    .loadFile = virQEMUCapsLoadFile,
    .saveFile = virQEMUCapsSaveFile,
    .privFree = virQEMUCapsCachePrivFree,
};

virsh capabilities returns quite a few entries; when you need to understand a specific capability, you can follow this flow to find the relevant code.

Back to the problem

With the cache understood, locating the problem becomes straightforward. qemuMonitorJSONGetKVMState issues the QMP command (query-kvm); we can manually reproduce the qemu process that libvirt starts when it restarts:

/usr/libexec/qemu-kvm -S -no-user-config -nodefaults -nographic -machine none,accel=kvm:tcg -qmp unix:/var/lib/libvirt/qemu/capabilities.monitor.sock,server,nowait -pidfile /var/lib/libvirt/qemu/capabilities.pidfile -daemonize

Depending on how the qemu monitor is started, there are different ways to connect:

# qemu monitor over tcp, listening on 127.0.0.1 port 4444
/usr/libexec/qemu-kvm -qmp tcp:127.0.0.1:4444,server,nowait

# qemu monitor over a unix socket; the socket file is created at /opt/qmp.socket
/usr/libexec/qemu-kvm -qmp unix:/opt/qmp.socket,server,nowait

Connect to the qemu monitor:

# over tcp, connect with telnet (use the port the monitor was started with, 4444 here)
> telnet 127.0.0.1 4444
Trying 127.0.0.1...
Connected to 127.0.0.1.
Escape character is '^]'.
{"QMP": {"version": {"qemu": {"micro": 0, "minor": 12, "major": 2}, "package": "qemu-kvm-ev-2.12.0-33.1.fh.3.4.el7"}, "capabilities": []}}
# over a unix socket, connect with nc -U
> nc -U /opt/qmp.socket
{"QMP": {"version": {"qemu": {"micro": 0, "minor": 12, "major": 2}, "package": "qemu-kvm-ev-2.12.0-33.1.fh.3.4.el7"}, "capabilities": []}}

After connecting, the session just sits waiting, but it cannot be used yet: the following command must be issued first, and only then can other QMP commands be executed.

{ "execute" : "qmp_capabilities" }

Run the query and look at the return value:

{"execute":"query-kvm"}
{"return": {"enabled": false, "present": true}}

The returned enabled is false, which leads libvirt to believe the host does not support KVM. Keep tracing into the qemu code:

KvmInfo *qmp_query_kvm(Error **errp)
{
    KvmInfo *info = g_malloc0(sizeof(*info));
    //returns the value of the kvm_allowed variable - this is the key point
    info->enabled = kvm_enabled();
    //returns true if CONFIG_KVM was defined at build time
    info->present = kvm_available();

    return info;
}
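
kvm_enabled() itself just reads a global flag; in qemu's include/sysemu/kvm.h it boils down to roughly the following (paraphrased from the header, so take the exact form as approximate):

extern bool kvm_allowed;
#define kvm_enabled()    (kvm_allowed)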

Now look at how kvm_allowed is used:

//kvm_all.c
static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->allowed = &kvm_allowed;
}

So kvm_allowed is exactly what ac->allowed points to.

The qemu startup sequence is fairly complex; here we only care about how this variable changes, so only the key code is quoted:

//accel.c
//accel_init_machine is invoked when qemu initializes the accelerator (configure_accelerator in vl.c)
static int accel_init_machine(AccelClass *acc, MachineState *ms)
{
    ObjectClass *oc = OBJECT_CLASS(acc);
    const char *cname = object_class_get_name(oc);
    AccelState *accel = ACCEL(object_new(cname));
    int ret;
    ms->accelerator = accel;
    *(acc->allowed) = true;
    //call the accelerator's init_machine method
    ret = acc->init_machine(ms);
    if (ret < 0) {
        ms->accelerator = NULL;
        *(acc->allowed) = false;
        object_unref(OBJECT(accel));
    } else {
        accel_register_compat_props(ms->accelerator);
    }
    return ret;
}

accel_init_machine is only an abstract entry point; the real work happens in each concrete accelerator. The command line specified accel=kvm:tcg, i.e. two accelerators: qemu tries the first one and falls back to the second if the first fails. For the kvm accelerator, init_machine is kvm_init, and inside kvm_init:

static int kvm_init(MachineState *ms)
{
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    ...
    //open the kvm device
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }
    //ioctl to fetch some information about kvm
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    ...
    do {
        //try to create a VM
        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
    } while (ret == -EINTR);

If any of these steps returns an error (ret < 0), allowed is set back to false, so the enabled value returned to libvirt is false.
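
The kvm-to-tcg fallback itself happens in configure_accelerator() in vl.c. A paraphrased sketch of that loop (simplified, not qemu's literal code; this is also where the "Back to tcg accelerator" message in the log below comes from):

static void configure_accelerator_sketch(MachineState *ms, const char *accel_opt)
{
    char **names = g_strsplit(accel_opt, ":", 0);   /* "kvm:tcg" -> {"kvm", "tcg"} */
    char **tmp;
    bool initialized = false;

    for (tmp = names; !initialized && tmp && *tmp; tmp++) {
        AccelClass *acc = accel_find(*tmp);         /* look the accelerator up by name */
        if (!acc)
            continue;
        if (accel_init_machine(acc, ms) < 0) {
            /* e.g. kvm_init() failed with EACCES on /dev/kvm */
            error_report("failed to initialize %s", acc->name);
        } else {
            initialized = true;                     /* *acc->allowed stays true */
        }
    }
    g_strfreev(names);
}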

With libvirt debug-level logging enabled, the log shows:

21587 Could not access KVM kernel module: Permission denied
21588 qemu-kvm: failed to initialize KVM: Permission denied
21589 qemu-kvm: Back to tcg accelerator

So opening /dev/kvm failed due to missing permissions.

ll /dev/kvm 
crw-rw-rw-+ 1 root kvm 10, 232 Jul 14 00:32 /dev/kvm

Note the trailing + in the mode string: a Linux ACL is in effect (managed with getfacl and setfacl):

getfacl /dev/kvm
getfacl: Removing leading '/' from absolute path names
# file: dev/kvm
# owner: root
# group: kvm
user::rw-
group::---
mask::rw-
other::rw-

The owning group's ACL entry is empty (group::---). The qemu user is a member of the kvm group, so it is matched by that entry rather than by other, and therefore ends up with no permission on /dev/kvm.
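
A quick way to confirm this is to repeat, as the qemu user, exactly the open() that kvm_init() performs (a small standalone test program, not part of qemu or libvirt):

/* kvm_open_check.c - mimic kvm_init()'s open of /dev/kvm. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/dev/kvm", O_RDWR);

    if (fd < 0) {
        /* With the ACL above (group::---) this prints "Permission denied". */
        fprintf(stderr, "Could not access KVM kernel module: %s\n", strerror(errno));
        return 1;
    }
    printf("/dev/kvm opened successfully\n");
    close(fd);
    return 0;
}

Running it as root succeeds, while sudo -u qemu ./kvm_open_check fails with Permission denied, matching the qemu log message.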

The /dev/kvm device itself is created when the kvm_intel kernel module is loaded. ll shows that kvm's major/minor device numbers are 10,232; the major number maps to:

cat /proc/devices 
Character devices:
  1 mem
  4 /dev/vc/0
  4 tty
  4 ttyS
  5 /dev/tty
  5 /dev/console
  5 /dev/ptmx
  7 vcs
 10 misc
 13 input
 21 sg
...

So kvm is a misc device. The kernel source confirms that kvm registers itself with the misc character device driver, and include/linux/miscdevice.h defines its minor number: #define KVM_MINOR 232.
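
This can be double-checked from user space with a stat() of the device node (a trivial standalone check, not something libvirt does):

/* kvm_devnum.c - print /dev/kvm's major/minor numbers; expect 10 and 232. */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(void)
{
    struct stat st;

    if (stat("/dev/kvm", &st) != 0) {
        perror("stat /dev/kvm");
        return 1;
    }
    printf("major=%u minor=%u\n", major(st.st_rdev), minor(st.st_rdev));
    return 0;
}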

The default permissions right after the kvm module is loaded are:

ll /dev/kvm 
crw------- 1 root root 10, 232 Jul 18 23:06 /dev/kvm

After qemu is installed, the package's spec scriptlets do several things to the kvm device permissions:

  1. Add the relevant groups and user

    getent group kvm >/dev/null || groupadd -g 36 -r kvm
    getent group qemu >/dev/null || groupadd -g 107 -r qemu
    getent passwd qemu >/dev/null || \
    useradd -r -u 107 -g qemu -G kvm -d / -s /sbin/nologin \
      -c "qemu user" qemu
    
  2. Install 80-kvm.rules as a udev rules file; this takes effect automatically on later reboots (for immediate effect see step 3). The content of 80-kvm.rules is:

     KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"
    

    i.e. set the kvm device's group to kvm and its mode to 0666.

  3. Then run the udev update commands; since step 2 already installed the new rules, they take effect once reloaded

    %udev_rules_update
    sh %{_sysconfdir}/sysconfig/modules/kvm.modules &> /dev/null || :
        udevadm trigger --subsystem-match=misc --sysname-match=kvm --action=add || :
    

    This can also be seen directly from the rpm package:

    rpm -qp --scripts qemu-kvm-ev-2.12.0-33.1.fh.3.4.el7.x86_64.rpm
    postinstall scriptlet (using /bin/sh):
    # load kvm modules now, so we can make sure no reboot is needed.
    # If there's already a kvm module installed, we don't mess with it
       
    udevadm control --reload >/dev/null 2>&1 || : 
       
    sh /etc/sysconfig/modules/kvm.modules &> /dev/null || :
        udevadm trigger --subsystem-match=misc --sysname-match=kvm --action=add || :
    

Nowhere in this whole process is there any explicit step that adds an ACL.

Search /usr/lib/udev/rules.d for kvm-related rules:

grep kvm * 
70-uaccess.rules:SUBSYSTEM=="misc", KERNEL=="kvm", TAG+="uaccess"
80-kvm.rules:KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"
81-kvm-rhel.rules:DEVPATH=="*/kvm", ACTION=="change", RUN+="/lib/udev/udev-kvm-check $env{COUNT} $env{EVENT}"

None of these rules removes the group permission either.

Workaround

Since the problem cannot be debugged across a reboot (the symptom disappears after rebooting), and poor compatibility between the hardware and the kernel is suspected, a workaround is adopted instead:

Based on the analysis above, reload the kvm module:

modprobe -r kvm_intel
modprobe kvm_intel

Or run setfacl manually to drop the ACL, or grant the group permission (this could also be added to the qemu post-install script):

setfacl -b /dev/kvm
or
setfacl -m g::rw /dev/kvm

Re-running virsh capabilities now shows <domain type='kvm'/>. (No libvirtd restart is needed: when the cache is looked up, isValid detects that the attributes of /dev/kvm have changed, the validation fails, and the cache entry is regenerated.)
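
The shape of that staleness check is roughly the following (a simplified sketch based on the behaviour described above; the real logic is virQEMUCapsIsValid in qemu_capabilities.c and checks more than just /dev/kvm):

#include <stdbool.h>
#include <unistd.h>

/* If the usability of /dev/kvm has changed since the capabilities were cached,
 * the cached entry is treated as invalid and regenerated. In libvirt the check
 * runs with the configured qemu user/group; access(2) stands in for that here. */
bool caps_entry_still_valid(bool cached_kvm_usable)
{
    bool kvm_usable_now = access("/dev/kvm", R_OK | W_OK) == 0;

    return kvm_usable_now == cached_kvm_usable;
}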

This is only a temporary workaround: the problem occurs very rarely and is hard to reproduce, so finding the real root cause still requires studying the kernel code.

References

Interacting with qemu virtual machines via QMP
