The Nova API implementation, analyzed through the instance-creation flow
1. First, be clear that adding a new API to nova is straightforward: the HTTP request handling layer is taken care of for us, so you simply follow the existing pattern; the real work is the logic inside your own API (a minimal sketch follows point 2 below).
2. All of the API files live under the directory 'nova/api/openstack/compute'; as of the Mitaka (M) release the directory 'nova/api/openstack/compute/contrib' no longer exists.
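To make point 1 concrete, here is a rough sketch of what a new plugin-style v2.1 API could look like in this layout. Everything specific in it is hypothetical (the 'my-things' alias, MyThingsController, MyThings); a real plugin also needs an entry point in setup.cfg and a policy rule, and details differ between releases.

from nova.api.openstack import extensions
from nova.api.openstack import wsgi

ALIAS = 'my-things'  # hypothetical resource name


class MyThingsController(wsgi.Controller):
    """Only the handler logic needs to be written; routing, auth and
    (de)serialization are handled by the surrounding framework."""

    @extensions.expected_errors(404)
    def index(self, req):
        return {'my_things': []}


class MyThings(extensions.V21APIExtensionBase):
    """Plugin glue that registers the resource with the v2.1 router."""

    name = 'MyThings'
    alias = ALIAS
    version = 1

    def get_resources(self):
        return [extensions.ResourceExtension(ALIAS, MyThingsController())]

    def get_controller_extensions(self):
        return []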
3. Analysis of the flow, taking instance creation as the example:
REQ: curl -i 'http://192.168.1.9:8774/v2/14fd316568bc4f6992ba161fd4e23001/servers' -X POST -H "X-Auth-Project-Id: 14fd316568bc4f6992ba161fd4e23001"
-H "User-Agent: python-novaclient" -H "Content-Type: application/json" -H "Accept: application/json" -H "X-Auth-Token: 3b9da36bfcf4491f88994b25c3045f95"
-d '{"server": {"name": "createVm", "imageRef": "db281a42-df86-4a3d-a8df-1af7b2eb80bc", "availability_zone": "nova", "key_name": "keyName",
"flavorRef": "8", "max_count": 1, "min_count": 1, "networks": [{"port": "406b40b4-6b66-47d9-a943-9e341aeae6ef"}]}}'
3.1 When the nova-api service receives this request, it can quickly be traced to the file 'nova/api/openstack/compute/servers.py'
# vim nova/api/openstack/compute/servers.py
class ServersController(wsgi.Controller):
"""The Server API base controller class for the OpenStack API."""
# These namespaces (matching entry points defined in setup.cfg) are loaded as API extensions via stevedore.enabled.EnabledExtensionManager (see the sketch after this class snippet)
EXTENSION_CREATE_NAMESPACE = 'nova.api.v21.extensions.server.create'
EXTENSION_REBUILD_NAMESPACE = 'nova.api.v21.extensions.server.rebuild'
EXTENSION_UPDATE_NAMESPACE = 'nova.api.v21.extensions.server.update'
EXTENSION_RESIZE_NAMESPACE = 'nova.api.v21.extensions.server.resize'
# NOTE: the directory 'nova/api/openstack/compute' contains two sub-directories, 'schemas' and 'views'; their contents are wired in here and used when an instance is created
_view_builder_class = views_servers.ViewBuilderV21
schema_server_create = schema_servers.base_create
schema_server_update = schema_servers.base_update
schema_server_rebuild = schema_servers.base_rebuild
schema_server_resize = schema_servers.base_resize
schema_server_create_v20 = schema_servers.base_create_v20
schema_server_update_v20 = schema_servers.base_update_v20
schema_server_rebuild_v20 = schema_servers.base_rebuild_v20
schema_server_create_v219 = schema_servers.base_create_v219
schema_server_update_v219 = schema_servers.base_update_v219
schema_server_rebuild_v219 = schema_servers.base_rebuild_v219
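A simplified, standalone sketch (not nova's exact code) of how one of these entry-point namespaces is loaded with stevedore; the check function here accepts everything, whereas nova's check_func consults the configured extension whitelist/blacklist:

from stevedore import enabled


def _load_everything(ext):
    # nova decides here whether a discovered extension should be enabled
    return True


create_ext_manager = enabled.EnabledExtensionManager(
    namespace='nova.api.v21.extensions.server.create',
    check_func=_load_everything,
    invoke_on_load=True)

# Each loaded plugin then gets a chance to process part of the create()
# request, roughly: create_ext_manager.map(lambda ext: ext.obj.server_create(...))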
3.2 Enter the create() method itself:
@wsgi.response(202)
@extensions.expected_errors((400, 403, 409, 413))
@validation.schema(schema_server_create_v20, '2.0', '2.0')
@validation.schema(schema_server_create, '2.1', '2.18')
@validation.schema(schema_server_create_v219, '2.19')
def create(self, req, body):
"""Creates a new server for a given user."""
Before doing any real work the method checks the operation against the policy rules (policy.json) and reads parts of the request body; the @validation.schema decorators above have already validated the body against the JSON schemas (a simplified illustration follows this snippet). It then calls:
(instances, resv_id) = self.compute_api.create(context,
inst_type,
image_uuid,
display_name=name,
display_description=description,
availability_zone=availability_zone,
forced_host=host, forced_node=node,
metadata=server_dict.get('metadata', {}),
admin_password=password,
requested_networks=requested_networks,
check_server_group_quota=True,
**create_kwargs)
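The @validation.schema decorators are what pull in the 'schemas' directory: the request body is validated against a JSON schema chosen by microversion before create() runs. Below is a trimmed-down stand-in for that idea using the jsonschema library directly; it is not the real base_create schema.

import jsonschema

server_create_schema = {
    'type': 'object',
    'properties': {
        'server': {
            'type': 'object',
            'properties': {
                'name': {'type': 'string', 'minLength': 1, 'maxLength': 255},
                'flavorRef': {'type': ['string', 'integer']},
                'imageRef': {'type': 'string'},
            },
            'required': ['name', 'flavorRef'],
        },
    },
    'required': ['server'],
}

body = {'server': {'name': 'createVm', 'flavorRef': '8',
                   'imageRef': 'db281a42-df86-4a3d-a8df-1af7b2eb80bc'}}
# raises jsonschema.ValidationError for a malformed body, which nova maps to HTTP 400
jsonschema.validate(body, server_create_schema)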
3.3 Enter the file 'nova/compute/api.py':
# The hooks mechanism is interesting; you can do a lot with it (a sketch of such a hook follows this method)
@hooks.add_hook("create_instance")
def create(self, context, instance_type,
image_href, kernel_id=None, ramdisk_id=None,
min_count=None, max_count=None,
display_name=None, display_description=None,
key_name=None, key_data=None, security_group=None,
availability_zone=None, forced_host=None, forced_node=None,
user_data=None, metadata=None, injected_files=None,
admin_password=None, block_device_mapping=None,
access_ip_v4=None, access_ip_v6=None, requested_networks=None,
config_drive=None, auto_disk_config=None, scheduler_hints=None,
legacy_bdm=True, shutdown_terminate=False,
check_server_group_quota=False):
"""Provision instances, sending instance information to the
scheduler. The scheduler will determine where the instance(s)
go and will handle creating the DB entries.
Returns a tuple of (instances, reservation_id)
"""
# Why is the policy checked a second time here?
# Check policies up front to fail before performing more expensive work
self._check_create_policies(context, availability_zone,
requested_networks, block_device_mapping, forced_host,
forced_node)
if requested_networks and max_count > 1:
self._check_multiple_instances_and_specified_ip(requested_networks)
if utils.is_neutron():
self._check_multiple_instances_neutron_ports(
requested_networks)
if availability_zone:
available_zones = availability_zones.\
get_availability_zones(context.elevated(), True)
if forced_host is None and availability_zone not in \
available_zones:
msg = _('The requested availability zone is not available')
raise exception.InvalidRequest(msg)
# Build the filter_properties dict used later for host filtering
filter_properties = scheduler_utils.build_filter_properties(
scheduler_hints, forced_host, forced_node, instance_type)
# Continue into the method '_create_instance()'
return self._create_instance(
context, instance_type,
image_href, kernel_id, ramdisk_id,
min_count, max_count,
display_name, display_description,
key_name, key_data, security_group,
availability_zone, user_data, metadata,
injected_files, admin_password,
access_ip_v4, access_ip_v6,
requested_networks, config_drive,
block_device_mapping, auto_disk_config,
filter_properties=filter_properties,
legacy_bdm=legacy_bdm,
shutdown_terminate=shutdown_terminate,
check_server_group_quota=check_server_group_quota)
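About the @hooks.add_hook("create_instance") decorator noted above: a hook is a plain class registered under the 'nova.hooks' entry-point namespace in setup.cfg, and its pre()/post() methods run before and after the decorated call. A sketch of such a hook (package and class names are hypothetical):

# setup.cfg of your own package, assumed layout:
#   [entry_points]
#   nova.hooks =
#       create_instance = mypackage.hooks:CreateInstanceHook
import logging

LOG = logging.getLogger(__name__)


class CreateInstanceHook(object):
    """Runs around nova.compute.api.API.create()."""

    def pre(self, *args, **kwargs):
        # called before create(); args/kwargs are the call's arguments
        LOG.info("about to create instance(s)")

    def post(self, rv, *args, **kwargs):
        # called after create(); rv is the (instances, reservation_id) tuple
        LOG.info("finished creating instance(s)")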
3.4 Enter the method '_create_instance()'. Its docstring summarizes what it does:
def _create_instance(self, context, instance_type,
image_href, kernel_id, ramdisk_id,
min_count, max_count,
display_name, display_description,
key_name, key_data, security_groups,
availability_zone, user_data, metadata, injected_files,
admin_password, access_ip_v4, access_ip_v6,
requested_networks, config_drive,
block_device_mapping, auto_disk_config, filter_properties,
reservation_id=None, legacy_bdm=True, shutdown_terminate=False,
check_server_group_quota=False):
"""Verify all the input parameters regardless of the provisioning
strategy being performed and schedule the instance(s) for
creation.
"""
......
self.compute_task_api.build_instances(context,
instances=instances, image=boot_meta,
filter_properties=filter_properties,
admin_password=admin_password,
injected_files=injected_files,
requested_networks=requested_networks,
security_groups=security_groups,
block_device_mapping=block_device_mapping,
legacy_bdm=False)
3.5 The next step is the call to self.compute_task_api.build_instances() shown at the end of 3.4.
3.5.1 First, let's look at 'self.compute_task_api'.
Following the code, self.compute_task_api comes from 'nova/conductor/api.py', which delegates to the class ComputeTaskAPI(object) in 'nova/conductor/rpcapi.py'.
Start with the rpcapi class's constructor:
def __init__(self):
super(ComputeTaskAPI, self).__init__()
target = messaging.Target(topic=CONF.conductor.topic,
namespace='compute_task',
version='1.0')
serializer = objects_base.NovaObjectSerializer()
self.client = rpc.get_client(target, serializer=serializer)
This brings in the 'oslo_messaging' module and RPC over a message queue. I won't expand on it here (it deserves a post of its own); the main thing to know is that the services talk to each other through this wrapped RPC message queue (RabbitMQ/ZeroMQ/...). A minimal standalone sketch of the pattern follows.
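For reference, a minimal standalone sketch of the oslo_messaging pattern that nova's rpc.get_client() wraps: build a transport from the configuration, point a Target at a topic, and cast() an asynchronous call. The topic, method name and arguments below are illustrative.

from oslo_config import cfg
import oslo_messaging as messaging

CONF = cfg.CONF

transport = messaging.get_transport(CONF)  # e.g. a rabbit:// URL from nova.conf
target = messaging.Target(topic='conductor', namespace='compute_task',
                          version='1.0')
client = messaging.RPCClient(transport, target)

# cast() is fire-and-forget: the caller does not wait for a result, which is
# why nova uses it for long-running operations such as building instances.
client.cast({}, 'build_instances', instances=[], image={})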
3.6 Now enter the file 'nova/conductor/api.py' and its build_instances() function:
def build_instances(self, context, instances, image, filter_properties,
admin_password, injected_files, requested_networks,
security_groups, block_device_mapping, legacy_bdm=True):
self.conductor_compute_rpcapi.build_instances(context,
instances=instances, image=image,
filter_properties=filter_properties,
admin_password=admin_password, injected_files=injected_files,
requested_networks=requested_networks,
security_groups=security_groups,
block_device_mapping=block_device_mapping,
legacy_bdm=legacy_bdm)
3.7 Enter the file 'nova/conductor/rpcapi.py' and its build_instances() function:
def build_instances(self, context, instances, image, filter_properties,
admin_password, injected_files, requested_networks,
security_groups, block_device_mapping, legacy_bdm=True):
image_p = jsonutils.to_primitive(image)
version = '1.10'
if not self.client.can_send_version(version):
version = '1.9'
if 'instance_type' in filter_properties:
flavor = filter_properties['instance_type']
flavor_p = objects_base.obj_to_primitive(flavor)
filter_properties = dict(filter_properties,
instance_type=flavor_p)
kw = {'instances': instances, 'image': image_p,
'filter_properties': filter_properties,
'admin_password': admin_password,
'injected_files': injected_files,
'requested_networks': requested_networks,
'security_groups': security_groups}
if not self.client.can_send_version(version):
version = '1.8'
kw['requested_networks'] = kw['requested_networks'].as_tuples()
if not self.client.can_send_version('1.7'):
version = '1.5'
bdm_p = objects_base.obj_to_primitive(block_device_mapping)
kw.update({'block_device_mapping': bdm_p,
'legacy_bdm': legacy_bdm})
# the code above is mostly RPC version negotiation plus a little parameter repacking
# the two lines below are the key part:
# prepare the RPC client,
# then cast(): an asynchronous call, where the argument 'build_instances' names the method to invoke on the receiving side
cctxt = self.client.prepare(version=version)
cctxt.cast(context, 'build_instances', **kw)
3.8 Enter the file 'nova/conductor/manager.py'; the cast invokes its build_instances() function:
def build_instances(self, context, instances, image, filter_properties,
admin_password, injected_files, requested_networks,
security_groups, block_device_mapping=None, legacy_bdm=True):
# TODO(ndipanov): Remove block_device_mapping and legacy_bdm in version
# 2.0 of the RPC API.
# TODO(danms): Remove this in version 2.0 of the RPC API
if (requested_networks and
not isinstance(requested_networks,
objects.NetworkRequestList)):
requested_networks = objects.NetworkRequestList(
objects=[objects.NetworkRequest.from_tuple(t)
for t in requested_networks])
# TODO(melwitt): Remove this in version 2.0 of the RPC API
flavor = filter_properties.get('instance_type')
if flavor and not isinstance(flavor, objects.Flavor):
# Code downstream may expect extra_specs to be populated since it
# is receiving an object, so lookup the flavor to ensure this.
flavor = objects.Flavor.get_by_id(context, flavor['id'])
filter_properties = dict(filter_properties, instance_type=flavor)
request_spec = {}
try:
# check retry policy. Rather ugly use of instances[0]...
# but if we've exceeded max retries... then we really only
# have a single instance.
# basic checks and parameter assembly for the scheduling filters
scheduler_utils.populate_retry(
filter_properties, instances[0].uuid)
request_spec = scheduler_utils.build_request_spec(
context, image, instances)
# run the scheduler filters to obtain the physical hosts that can boot these instances
# one physical host is picked for each instance
# the filtering rules themselves will be covered in a separate post
hosts = self._schedule_instances(
context, request_spec, filter_properties)
except Exception as exc:
updates = {'vm_state': vm_states.ERROR, 'task_state': None}
for instance in instances:
self._set_vm_state_and_notify(
context, instance.uuid, 'build_instances', updates,
exc, request_spec)
self._cleanup_allocated_networks(
context, instance, requested_networks)
return
# loop over the instances and kick off their builds
# personally I suspect this could be an issue: with many instances the loop effectively queues them one after another and may take time, so some policy could be added here
# on reflection, though, the libvirt API is invoked per domain anyway, so the loop is needed; one idea is to optimize the case where several instances land on the same host (a toy sketch of that idea follows this method)
for (instance, host) in six.moves.zip(instances, hosts):
try:
instance.refresh()
except (exception.InstanceNotFound,
exception.InstanceInfoCacheNotFound):
LOG.debug('Instance deleted during build', instance=instance)
continue
local_filter_props = copy.deepcopy(filter_properties)
scheduler_utils.populate_filter_properties(local_filter_props,
host)
# The block_device_mapping passed from the api doesn't contain
# instance specific information
bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
context, instance.uuid)
# host: the physical host this instance will be started on; the request is delivered to it over RPC
self.compute_rpcapi.build_and_run_instance(context,
instance=instance, host=host['host'], image=image,
request_spec=request_spec,
filter_properties=local_filter_props,
admin_password=admin_password,
injected_files=injected_files,
requested_networks=requested_networks,
security_groups=security_groups,
block_device_mapping=bdms, node=host['nodename'],
limits=host['limits'])
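Purely to illustrate the optimization idea in the note above (this is not nova code, and build_and_run_instance() stays a per-instance RPC): the scheduled (instance, host) pairs could first be grouped by target host, so instances headed for the same compute node can be handled as a batch.

import collections

import six


def group_by_host(instances, hosts):
    """Return {host_name: [instance, ...]} from the two parallel lists."""
    grouped = collections.defaultdict(list)
    for instance, host in six.moves.zip(instances, hosts):
        grouped[host['host']].append(instance)
    return grouped

# for host_name, host_instances in group_by_host(instances, hosts).items():
#     ... a single batched message per host could be sent here ...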
3.9 Now enter the file 'nova/compute/rpcapi.py' and find the function build_and_run_instance():
def build_and_run_instance(self, ctxt, instance, host, image, request_spec,
filter_properties, admin_password=None, injected_files=None,
requested_networks=None, security_groups=None,
block_device_mapping=None, node=None, limits=None):
version = '4.0'
# prepare the RPC client; the 'server' argument matters: it decides which physical host the message is sent to,
# in other words, which physical host the instance will be started on
cctxt = self.client.prepare(server=host, version=version)
# an asynchronous cast; it has to be asynchronous here, otherwise the loop above would be pointless
cctxt.cast(ctxt, 'build_and_run_instance', instance=instance,
image=image, request_spec=request_spec,
filter_properties=filter_properties,
admin_password=admin_password,
injected_files=injected_files,
requested_networks=requested_networks,
security_groups=security_groups,
block_device_mapping=block_device_mapping, node=node,
limits=limits)
3.10 Enter the file 'nova/compute/manager.py'; the cast arrives at its build_and_run_instance() function:
@wrap_exception()
@reverts_task_state
@wrap_instance_fault
def build_and_run_instance(self, context, instance, image, request_spec,
filter_properties, admin_password=None,
injected_files=None, requested_networks=None,
security_groups=None, block_device_mapping=None,
node=None, limits=None):
@utils.synchronized(instance.uuid)
def _locked_do_build_and_run_instance(*args, **kwargs):
# NOTE(danms): We grab the semaphore with the instance uuid
# locked because we could wait in line to build this instance
# for a while and we want to make sure that nothing else tries
# to do anything with this instance while we wait.
with self._build_semaphore:
self._do_build_and_run_instance(*args, **kwargs)
# NOTE(danms): We spawn here to return the RPC worker thread back to
# the pool. Since what follows could take a really long time, we don't
# want to tie up RPC workers.
# this relies on the third-party library eventlet for high concurrency (a standalone sketch of the pattern follows this method)
# the first argument to spawn_n is the function it will execute
utils.spawn_n(_locked_do_build_and_run_instance,
context, instance, image, request_spec,
filter_properties, admin_password, injected_files,
requested_networks, security_groups,
block_device_mapping, node, limits)
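A small standalone sketch of the eventlet pattern used here: spawn_n() hands the long-running build to a green thread so the RPC worker returns to the pool, and a semaphore bounds how many builds run concurrently. The limit of 10 and the toy workload are illustrative (nova takes its limit from the max_concurrent_builds option).

import eventlet
eventlet.monkey_patch()

from eventlet import semaphore

build_semaphore = semaphore.Semaphore(10)


def do_build(instance_uuid):
    with build_semaphore:
        print('building %s' % instance_uuid)
        eventlet.sleep(1)  # stands in for the real build work


for fake_uuid in ('uuid-1', 'uuid-2', 'uuid-3'):
    # fire-and-forget: spawn_n discards the return value, like utils.spawn_n
    eventlet.spawn_n(do_build, fake_uuid)

eventlet.sleep(2)  # in this toy script, give the green threads time to finish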
3.11 Next, enter the method:
@hooks.add_hook('build_instance')
@wrap_exception()
@reverts_task_state
@wrap_instance_event
@wrap_instance_fault
def _do_build_and_run_instance(self, context, instance, image,
request_spec, filter_properties, admin_password, injected_files,
requested_networks, security_groups, block_device_mapping,
node=None, limits=None):
......
try:
with timeutils.StopWatch() as timer:
# execution then continues into this method
self._build_and_run_instance(context, instance, image,
decoded_files, admin_password, requested_networks,
security_groups, block_device_mapping, node, limits,
filter_properties)
LOG.info(_LI('Took %0.2f seconds to build instance.'),
timer.elapsed(), instance=instance)
return build_results.ACTIVE
3.12 Enter the method:
def _build_and_run_instance(self, context, instance, image, injected_files,
admin_password, requested_networks, security_groups,
block_device_mapping, node, limits, filter_properties):
image_name = image.get('name')
# emit a notification event over the messaging bus (a sketch of the notification side follows this method)
self._notify_about_instance_usage(context, instance, 'create.start',
extra_usage_info={'image_name': image_name})
try:
# get the resource tracker for the host given by 'node'
rt = self._get_resource_tracker(node)
# claim the resources on the host before the instance is actually started
with rt.instance_claim(context, instance, limits):
# NOTE(russellb) It's important that this validation be done
# *after* the resource tracker instance claim, as that is where
# the host is set on the instance.
# the group policy comes into play during scheduler filtering, so at this point the chosen
# host has to be checked against filter_properties
self._validate_instance_group_policy(context, instance,
filter_properties)
image_meta = objects.ImageMeta.from_dict(image)
# build the resources the instance needs,
# including the network and the block device mappings (BDMs)
with self._build_resources(context, instance,
requested_networks, security_groups, image_meta,
block_device_mapping) as resources:
instance.vm_state = vm_states.BUILDING
instance.task_state = task_states.SPAWNING
# NOTE(JoshNang) This also saves the changes to the
# instance from _allocate_network_async, as they aren't
# saved in that function to prevent races.
instance.save(expected_task_state=
task_states.BLOCK_DEVICE_MAPPING)
block_device_info = resources['block_device_info']
network_info = resources['network_info']
LOG.debug('Start spawning the instance on the hypervisor.',
instance=instance)
with timeutils.StopWatch() as timer:
# self.driver is the virtualization backend in use,
# selected through the configuration file
# now jump to 3.13
self.driver.spawn(context, instance, image_meta,
injected_files, admin_password,
network_info=network_info,
block_device_info=block_device_info)
LOG.info(_LI('Took %0.2f seconds to spawn the instance on '
'the hypervisor.'), timer.elapsed(),
instance=instance)
except ......
......
# If CONF.default_access_ip_network_name is set, grab the
# corresponding network and set the access ip values accordingly.
network_name = CONF.default_access_ip_network_name
if (network_name and not instance.access_ip_v4 and
not instance.access_ip_v6):
# Note that when there are multiple ips to choose from, an
# arbitrary one will be chosen.
for vif in network_info:
if vif['network']['label'] == network_name:
for ip in vif.fixed_ips():
if not instance.access_ip_v4 and ip['version'] == 4:
instance.access_ip_v4 = ip['address']
if not instance.access_ip_v6 and ip['version'] == 6:
instance.access_ip_v6 = ip['address']
break
self._update_instance_after_spawn(context, instance)
try:
instance.save(expected_task_state=task_states.SPAWNING)
except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError) as e:
with excutils.save_and_reraise_exception():
self._notify_about_instance_usage(context, instance,
'create.end', fault=e)
self._update_scheduler_instance_info(context, instance)
self._notify_about_instance_usage(context, instance, 'create.end',
extra_usage_info={'message': _('Success')},
network_info=network_info)
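A rough sketch of the notification side referred to by the create.start/create.end calls: _notify_about_instance_usage() ultimately publishes events such as 'compute.instance.create.start' on the oslo_messaging notification bus. The publisher id, topic and payload fields below are illustrative.

from oslo_config import cfg
import oslo_messaging as messaging

CONF = cfg.CONF

transport = messaging.get_notification_transport(CONF)
notifier = messaging.Notifier(transport,
                              driver='messaging',
                              publisher_id='compute.node-1',
                              topics=['notifications'])

notifier.info({}, 'compute.instance.create.start',
              {'instance_id': 'some-uuid', 'image_name': 'some-image'})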
3.13 Enter the file 'nova/virt/libvirt/driver.py' and its spawn() method.
Keep in mind that this ultimately calls the libvirt API; what this layer really does is assemble the guest XML.
# NOTE(ilyaalekseyev): Implementation like in multinics
# for xenapi(tr3buchet)
def spawn(self, context, instance, image_meta, injected_files,
admin_password, network_info=None, block_device_info=None):
disk_info = blockinfo.get_disk_info(CONF.libvirt.virt_type,
instance,
image_meta,
block_device_info)
# create the images the instance needs
# this is where the image-cache feature comes in
# the idea is simple: a directory is designated as the cache and every required image is stored there; the next time an instance boots from the same image,
# that directory is checked first, and if the image is already present it is not downloaded from the remote store again (a toy sketch follows this spawn() listing)
self._create_image(context, instance,
disk_info['mapping'],
network_info=network_info,
block_device_info=block_device_info,
files=injected_files,
admin_pass=admin_password)
# assemble the guest XML
# if you need to customize the instance's XML, this is the place to start
xml = self._get_guest_xml(context, instance, network_info,
disk_info, image_meta,
block_device_info=block_device_info,
write_to_disk=True)
# now enter this method, see 3.14
self._create_domain_and_network(context, xml, instance, network_info,
disk_info,
block_device_info=block_device_info)
LOG.debug("Instance is running", instance=instance)
def _wait_for_boot():
"""Called at an interval until the VM is running."""
state = self.get_info(instance).state
if state == power_state.RUNNING:
LOG.info(_LI("Instance spawned successfully."),
instance=instance)
raise loopingcall.LoopingCallDone()
timer = loopingcall.FixedIntervalLoopingCall(_wait_for_boot)
timer.start(interval=0.5).wait()
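A toy illustration of the image-cache idea mentioned inside _create_image() above. It is a heavy simplification of nova's imagebackend/imagecache code: fetch_from_glance is a hypothetical callable, and the cache path and file-naming scheme are only indicative.

import hashlib
import os

CACHE_DIR = '/var/lib/nova/instances/_base'


def get_cached_image(image_id, fetch_from_glance):
    """Return a path to the base image, downloading it only on a cache miss."""
    # the cached file name is derived from a hash of the image id
    fname = hashlib.sha1(image_id.encode('utf-8')).hexdigest()
    path = os.path.join(CACHE_DIR, fname)
    if not os.path.exists(path):
        # first boot from this image on this host: fetch it once
        fetch_from_glance(image_id, path)
    # later boots reuse the cached base image (e.g. as a qcow2 backing file)
    return path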
3.14 Enter the function:
def _create_domain_and_network(self, context, xml, instance, network_info,
disk_info, block_device_info=None,
power_on=True, reboot=False,
vifs_already_plugged=False):
"""Do required network setup and create domain."""
......
guest = None
try:
with self.virtapi.wait_for_instance_event(
instance, events, deadline=timeout,
error_callback=self._neutron_failed_callback):
self.plug_vifs(instance, network_info)
self.firewall_driver.setup_basic_filtering(instance,
network_info)
self.firewall_driver.prepare_instance_filter(instance,
network_info)
with self._lxc_disk_handler(instance, instance.image_meta,
block_device_info, disk_info):
# now continue to 3.15
guest = self._create_domain(
xml, pause=pause, power_on=power_on)
self.firewall_driver.apply_instance_filter(instance,
network_info)
3.15 Enter the function '_create_domain':
# TODO(sahid): Consider renaming this to _create_guest.
def _create_domain(self, xml=None, domain=None,
power_on=True, pause=False):
"""Create a domain.
Either domain or xml must be passed in. If both are passed, then
the domain definition is overwritten from the xml.
:returns guest.Guest: Guest just created
"""
if xml:
# reading on from here, you quickly see that the libvirt API being called is 'defineXML(xml)' (a minimal standalone sketch follows this function)
# param xml: XML domain definition of the guest.
# returns: a virDomain instance
guest = libvirt_guest.Guest.create(xml, self._host)
else:
guest = libvirt_guest.Guest(domain)
if power_on or pause:
guest.launch(pause=pause)
if not utils.is_neutron():
guest.enable_hairpin()
return guest
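A minimal standalone sketch of what Guest.create() plus guest.launch() boil down to with the libvirt Python bindings: define the persistent domain from the XML, then start it. The connection URI and the stripped-down XML are illustrative only; the XML produced by _get_guest_xml() is far richer.

import libvirt

conn = libvirt.open('qemu:///system')

xml = """
<domain type='kvm'>
  <name>createVm</name>
  <memory unit='MiB'>512</memory>
  <vcpu>1</vcpu>
  <os><type arch='x86_64'>hvm</type></os>
</domain>
"""

dom = conn.defineXML(xml)   # Guest.create() -> virDomainDefineXML
dom.createWithFlags(0)      # guest.launch() -> start the defined domain
print(dom.info())           # [state, maxMem, memory, nrVirtCpu, cpuTime]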
3.16 At this point the instance-creation flow is, in outline, complete. Afterwards, libvirt reports the instance's state back to the upper layers in the form of events, as sketched below.
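A small sketch of that event mechanism: the libvirt driver registers a lifecycle callback so that state changes (started, stopped, crashed, ...) are pushed back up instead of polled. Standalone illustration, not nova's own event-handling code.

import libvirt


def lifecycle_callback(conn, dom, event, detail, opaque):
    # event is e.g. VIR_DOMAIN_EVENT_STARTED / VIR_DOMAIN_EVENT_STOPPED ...
    print('domain %s lifecycle event %d (detail %d)'
          % (dom.name(), event, detail))


libvirt.virEventRegisterDefaultImpl()   # must be set up before opening the connection
conn = libvirt.open('qemu:///system')
conn.domainEventRegisterAny(None,
                            libvirt.VIR_DOMAIN_EVENT_ID_LIFECYCLE,
                            lifecycle_callback, None)

while True:
    libvirt.virEventRunDefaultImpl()    # dispatch pending libvirt events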