日韩av黄I国产麻豆传媒I国产91av视频在线观看I日韩一区二区三区在线看I美女国产在线I麻豆视频国产在线观看I成人黄色短片

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 >

libvirt numatune 原理

發布時間:2023/12/20 41 豆豆
生活随笔 收集整理的這篇文章主要介紹了 libvirt numatune 原理,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。

目錄

numatune是什么

numatune memory

libvirt

內核

numatune memnode

qemu

內核

alloc page

結論


numatune是什么

numatune是libvirt的一個參數,可以用在NUMA架構的虛擬機上,用來控制虛擬機內存訪問的親和性。

使用方法如下:

xml:

<domain>
  ...
  <numatune>
    <!-- domain-wide memory policy -->
    <memory mode="strict" nodeset="0-1"/>
    <!-- per guest NUMA cell policies -->
    <memnode cellid="0" mode="strict" nodeset="0"/>
    <memnode cellid="1" mode="strict" nodeset="1"/>
  </numatune>
  ...
</domain>

numatune里由<memory>和<memnode>兩部分組成

numatune memory

numatune memory里的值會寫到 cgroup 的 cpuset.mems里,由 libvirt 對cgroup進行設置

libvirt

對emulator線程進行cgroup cpuset.mems 設置

/*
 * Excerpt from libvirt: when mem_mask is set, obtain the emulator-thread
 * cgroup (VIR_CGROUP_THREAD_EMULATOR) and write mem_mask to its cpuset.mems.
 * The statement that runs when either call fails is not part of the
 * article's excerpt, so the condition below has no body.
 */
if (mem_mask)
    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                           false, &cgroup_temp) < 0 ||
        virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0)

對vcpu線程進行 cgroup cpuset.mems 設置

/*
 * Excerpt (caller side): run the per-vCPU setup for every online vCPU of
 * the domain; the first failure aborts with -1.
 */
for (i = 0; i < maxvcpus; i++) {
    vcpu = virDomainDefGetVcpu(vm->def, i);

    /* Offline vCPUs are skipped. */
    if (!vcpu->online)
        continue;

    if (qemuProcessSetupVcpu(vm, i) < 0)
        return -1;
}

/*
 * Excerpt (callee side, abridged by the article: the return type, the
 * lookup of `vcpu` and the failure handling are not shown).
 * Looks up the vCPU's thread id and passes it, together with the vCPU's
 * cpumask, the cputune period/quota and the scheduler settings, to
 * qemuProcessSetupPid() as a VIR_CGROUP_THREAD_VCPU thread.
 */
qemuProcessSetupVcpu(virDomainObjPtr vm, unsigned int vcpuid)
{
    pid_t vcpupid = qemuDomainGetVcpuPid(vm, vcpuid); /* thread id of the vCPU thread */

    if (qemuProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU,
                            vcpuid, vcpu->cpumask,
                            vm->def->cputune.period,
                            vm->def->cputune.quota,
                            &vcpu->sched) < 0)
        /* failure handling is not shown in the article's excerpt */
}

內核

設置好cgroup,內核就要根據cgroup設置的值對進程的vma進行遷移,以及在指定的node上為進程分配頁面,主要在 update_tasks_nodemask 函數中實現。

update_tasks_nodemask會遍歷當前cgroup下所有的task_struct進行操作

1、修改task_struct->mems_allowed

2、遍歷task_struct下所有的vma,修改vma->vm_policy。cgroup這個修改vma->vm_policy的功能,是在 4.x 內核里才添加的,3.x 的內核沒有這個功能

3、判斷是否進行遷移,如果設置了遷移,則把不在指定node上的頁面,全部遷移到指定node 里

/*
 * Excerpt from the kernel's update_tasks_nodemask(): iterate over every
 * task attached to the cpuset `cs` and apply the new memory node mask.
 * Declarations of `it`, `task` and `newmems` are outside the excerpt.
 */
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
    struct mm_struct *mm;
    bool migrate;

    /* Set task_struct->mems_allowed to the value of cpuset.mems. */
    cpuset_change_task_nodemask(task, &newmems);

    mm = get_task_mm(task);
    if (!mm)
        continue;

    /* Decide whether pages may be migrated. */
    migrate = is_memory_migrate(cs);

    /* Walk all VMAs of the task and rebind each vma->vm_policy to cpuset.mems. */
    mpol_rebind_mm(mm, &cs->mems_allowed);

    /*
     * When migration is allowed, move the pages that are not on the
     * cpuset.mems nodes onto the allowed nodes; otherwise only drop the
     * mm reference taken by get_task_mm().
     */
    if (migrate)
        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
    else
        mmput(mm);
}

migrate = is_memory_migrate(cs); 是怎麼判斷是否可以遷移的?由 cgroup 的 cpuset.memory_migrate 決定。目前在本機上看到這個值是 1(true),表示會遷移(註:內核文檔中 cpuset.memory_migrate 的默認值是 0,這裡的 1 應該是 libvirt 建立 cgroup 時設置的,待確認)

[root@wl.localhost.com ~]# cat /cgroup/cpuset/libvirt/qemu-29-instance-535b3c33-49e1-4f01-9192-18af59d49af8/emulator/cpuset.memory_migrate
1

numatune memnode

qemu調用mbind函數實現對虛擬機內存node的綁定

qemu

如果設置了numatune,可以看到qemu的參數會多出來host-nodes=0,policy=bind.

-object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/17-centos,share=yes,size=2147483648,host-nodes=0,policy=bind

遍歷所有的object

/*
 * Excerpt from QEMU: iterate over the "object" option group and call
 * user_creatable_add_opts_foreach for each entry, passing
 * object_create_delayed as the opaque argument.
 * NOTE(review): the final ')' closes an enclosing expression that the
 * article does not show (presumably an `if (` around this call).
 */
qemu_opts_foreach(qemu_find_opts("object"),
                  user_creatable_add_opts_foreach,
                  object_create_delayed, NULL))

host_memory_backend_memory_complete

這個函數主要有三個功能

1、alloc,分配內存,打開大頁文件,設置文件大小,初始化vma。

2、mbind 設置NUMA 內存訪問策略,flag = MPOL_MF_STRICT | MPOL_MF_MOVE; MPOL_MF_MOVE 表示移動不在指定node的頁到指定node

3、prealloc 進行內存預分配

/*
 * Completion hook of a QEMU host memory backend (abridged excerpt; the
 * dotted line marks code the article left out, which is why `maxnode` and
 * the `out:` label are not visible here).
 *
 * Visible steps, in order:
 *   1. call the class's alloc callback, if any;
 *   2. mbind() the backend's RAM range with the backend's policy and host
 *      node mask, using MPOL_MF_STRICT | MPOL_MF_MOVE;
 *   3. if backend->prealloc is set, preallocate the memory.
 *
 * uc:   the object being completed; viewed as a HostMemoryBackend.
 * errp: set when mbind() fails, except for ENOSYS combined with
 *       MPOL_DEFAULT, which is tolerated.
 */
static void host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    Error *local_err = NULL;
    void *ptr;
    uint64_t sz;

    if (bc->alloc) {
        bc->alloc(backend, &local_err);
        if (local_err) {
            goto out;
        }
..........
        /* Host address and size of the backend's memory region. */
        ptr = memory_region_get_ram_ptr(&backend->mr);
        sz = memory_region_size(&backend->mr);
        /* ensure policy won't be ignored in case memory is preallocated
         * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
         * this doesn't catch hugepage case. */
        unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
        if (mbind(ptr, sz, backend->policy,
                  maxnode ? backend->host_nodes : NULL, maxnode + 1, flags)) {
            /* A missing mbind() syscall is only acceptable for the default policy. */
            if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
                error_setg_errno(errp, errno,
                                 "cannot bind memory to host NUMA nodes");
                return;
            }
        }
        /* Preallocate memory after the NUMA policy has been instantiated.
         * This is necessary to guarantee memory is allocated with
         * specified NUMA policy in place.
         */
        if (backend->prealloc) {
            os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
                            smp_cpus, &local_err);
            if (local_err) {
                goto out;
            }
        }
    }
}

內核

mbind函數實現原理

1、創建一個mempolicy new

2、調用mbind_range,對 [start, start + len) 範圍內的所有vma,進行vma->vm_policy設置

3、遷移page

/*
 * Abridged excerpt of the kernel's do_mbind(), the worker behind mbind().
 * The article dropped the declarations of `end`, `ret`, `err` and
 * `nr_failed`, all error handling and the return statement; only the
 * main steps remain:
 *   1. round the length up to whole pages and compute the end address;
 *   2. build the new mempolicy from mode, mode_flags and nmask;
 *   3. queue pages of the range onto `pagelist` (queue_pages_range);
 *   4. install the policy on the VMAs of the range (mbind_range);
 *   5. if that succeeded and pages were queued, migrate them.
 */
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
    struct mm_struct *mm = current->mm;
    struct mempolicy *new;
    LIST_HEAD(pagelist);

    /* Round len up to a multiple of PAGE_SIZE. */
    len = (len + PAGE_SIZE - 1) & PAGE_MASK;
    end = start + len;

    new = mpol_new(mode, mode_flags, nmask);

    /* NOTE(review): MPOL_MF_INVERT presumably selects pages NOT on nmask - confirm. */
    ret = queue_pages_range(mm, start, end, nmask,
                            flags | MPOL_MF_INVERT, &pagelist);
    err = mbind_range(mm, start, end, new);
    if (!err) {
        if (!list_empty(&pagelist)) {
            nr_failed = migrate_pages(&pagelist, new_page, NULL,
                                      start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
        }
    }
}

mbind_range

對 [start, start + len) 範圍內的所有vma,進行vma->vm_policy設置,設置為numatune里對應的 policy

/*
 * Excerpt from the kernel's mbind_range(): walk every VMA that starts
 * before `end` and install the new policy on it. The excerpt is abridged;
 * the `out:` label and the code that jumps to `replace:` are not shown.
 */
for (; vma && vma->vm_start < end; prev = vma, vma = next) {
    next = vma->vm_next;

    /* Clamp the requested range to this VMA. */
    vmstart = max(start, vma->vm_start);
    vmend = min(end, vma->vm_end);

    /* Page offset of vmstart, derived from the VMA's own page offset. */
    pgoff = vma->vm_pgoff +
            ((vmstart - vma->vm_start) >> PAGE_SHIFT);

    prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                     vma->anon_vma, vma->vm_file, pgoff,
                     new_pol, vma->vm_userfaultfd_ctx);
replace:
    /* Set vma->vm_policy for the VMA being visited. */
    err = vma_replace_policy(vma, new_pol);
    if (err)
        goto out;
}

alloc page

頁面分配器的函數在內核中有著各種各樣的版本,不論是返回虛擬地址的還是返回struct page指針的,最終都會調用一個共同的接口:__alloc_pages_nodemask()

/*
 * Abridged excerpt of the kernel page allocator's common entry point.
 *
 * gfp_mask:      allocation flags; masked with gfp_allowed_mask first.
 * order:         allocation order.
 * preferred_nid: preferred NUMA node, forwarded to prepare_alloc_pages().
 * nodemask:      optional node mask, forwarded to prepare_alloc_pages().
 *
 * Returns the allocated page, or NULL when prepare_alloc_pages() rejects
 * the request.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
                       nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW; /* first try to allocate against the LOW watermark */
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };

    gfp_mask &= gfp_allowed_mask;
    alloc_mask = gfp_mask;
    if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac,
                             &alloc_mask, &alloc_flags))
        return NULL;

    /* First allocation attempt */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);

    /*
     * NOTE(review): as quoted by the article, the slow path runs
     * unconditionally and overwrites the first result; the upstream
     * function presumably only falls back when the first attempt fails -
     * confirm against the kernel source.
     */
    page = __alloc_pages_slowpath(alloc_mask, order, &ac);

    return page;
}

通過prepare_alloc_pages函數初始化alloc_context,確認本次分配的zone,zonelist,mask,以及遷移類型。

分配node時,會先判斷vma里有沒有設定policy:有則優先使用vma->vm_policy里設定的node,否則采用task_struct 里的 mems_allowed。

/*
 * Fill in the allocation context for one page allocation request.
 *
 * gfp_mask:      allocation flags of the request.
 * order:         allocation order (only used for fault injection here).
 * preferred_nid: node whose zonelist is used.
 * nodemask:      caller-supplied node mask, may be NULL.
 * ac:            allocation context to initialise.
 * alloc_mask:    in/out; gains __GFP_HARDWALL when cpusets are enabled.
 * alloc_flags:   in/out; may gain ALLOC_CPUSET and ALLOC_CMA.
 *
 * Returns false when should_fail_alloc_page() asks for the allocation to
 * fail, true otherwise.
 */
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
                                       int preferred_nid, nodemask_t *nodemask,
                                       struct alloc_context *ac, gfp_t *alloc_mask,
                                       unsigned int *alloc_flags)
{
    ac->high_zoneidx = gfp_zone(gfp_mask);
    ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    ac->nodemask = nodemask;
    ac->migratetype = gfpflags_to_migratetype(gfp_mask);

    /* Only relevant when the cpuset feature is enabled. */
    if (cpusets_enabled()) {
        *alloc_mask |= __GFP_HARDWALL;
        /*
         * No node mask was passed in (e.g. none came from a VMA policy):
         * fall back to the current task's mems_allowed.
         */
        if (!ac->nodemask)
            ac->nodemask = &cpuset_current_mems_allowed;
        else
            *alloc_flags |= ALLOC_CPUSET;
    }

    fs_reclaim_acquire(gfp_mask);
    fs_reclaim_release(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

    if (should_fail_alloc_page(gfp_mask, order))
        return false;

    if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
        *alloc_flags |= ALLOC_CMA;

    return true;
}

結論

memory 最終作用于 cgroup里的 cpuset.mems, memnode 作用于 qemu 里的 host-nodes= 參數,最終使用 mbind 函數

memory粒度大,針對的是整個進程,memnode粒度細,只針對特定的vma設置相應的vma->vm_policy。

在分配頁面時(shí),會(huì)首先選擇vma->vm_policy里設(shè)定的node,在vma->vm_policy里沒有時(shí)候才使用task_struct->mems_allowed,如果都沒有則使用當(dāng)前cpu的node.

cgroup里默認會遷移,在cgroup設置時會遷移不在指定node的頁面;mbind時因為設置了MPOL_MF_MOVE也會遷移

參考

https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/virtualization_tuning_and_optimization_guide/sect-virtualization_tuning_optimization_guide-numa-numa_and_libvirt

總結

以上是生活随笔為你收集整理的libvirt numatune 原理的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。