How libvirt numatune works
Contents

What is numatune
numatune memory
libvirt
Kernel
numatune memnode
qemu
Kernel
alloc page
Conclusion
What is numatune

numatune is a libvirt parameter that can be used on virtual machines running on NUMA hardware to control the affinity of the guest's memory accesses.

It is used as follows:

xml:

<domain>
  ...
  <numatune>
    <memory mode="strict" nodeset="0-1"/>
    <memnode cellid="0" mode="strict" nodeset="0"/>
    <memnode cellid="1" mode="strict" nodeset="1"/>
  </numatune>
  ...
</domain>

numatune consists of two parts, <memory> and <memnode>.
numatune memory
The value of numatune memory is written into the cgroup's cpuset.mems; libvirt performs the cgroup configuration.
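For illustration (not part of the original walkthrough), the memory nodeset can also be changed at runtime through the libvirt API, after which libvirtd rewrites cpuset.mems in the emulator and vcpu cgroups as described below. A minimal sketch, assuming a running domain with the hypothetical name instance-00000001:

/* Sketch: runtime equivalent of <numatune><memory nodeset="0-1"/></numatune>.
 * Assumes libvirt development headers; compile with: gcc demo.c -lvirt */
#include <stdio.h>
#include <libvirt/libvirt.h>

int main(void)
{
    virConnectPtr conn = virConnectOpen("qemu:///system");
    if (!conn)
        return 1;

    virDomainPtr dom = virDomainLookupByName(conn, "instance-00000001"); /* hypothetical name */
    if (!dom) {
        virConnectClose(conn);
        return 1;
    }

    virTypedParameterPtr params = NULL;
    int nparams = 0, maxparams = 0;

    /* VIR_DOMAIN_NUMA_NODESET corresponds to the nodeset attribute of <memory>. */
    virTypedParamsAddString(&params, &nparams, &maxparams,
                            VIR_DOMAIN_NUMA_NODESET, "0-1");

    if (virDomainSetNumaParameters(dom, params, nparams,
                                   VIR_DOMAIN_AFFECT_LIVE) < 0)
        fprintf(stderr, "failed to set numatune parameters\n");

    virTypedParamsFree(params, nparams);
    virDomainFree(dom);
    virConnectClose(conn);
    return 0;
}

virsh numatune <domain> --nodeset 0-1 --live drives the same call from the command line.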
libvirt
Set cgroup cpuset.mems for the emulator thread:

if (mem_mask)
    if (virCgroupNewThread(priv->cgroup, VIR_CGROUP_THREAD_EMULATOR, 0,
                           false, &cgroup_temp) < 0 ||
        virCgroupSetCpusetMems(cgroup_temp, mem_mask) < 0)

Set cgroup cpuset.mems for the vcpu threads:

for (i = 0; i < maxvcpus; i++) {
    vcpu = virDomainDefGetVcpu(vm->def, i);

    if (!vcpu->online)
        continue;

    if (qemuProcessSetupVcpu(vm, i) < 0)
        return -1;
}

qemuProcessSetupVcpu(virDomainObjPtr vm, unsigned int vcpuid)
{
    pid_t vcpupid = qemuDomainGetVcpuPid(vm, vcpuid); // get the vcpu thread id

    if (qemuProcessSetupPid(vm, vcpupid, VIR_CGROUP_THREAD_VCPU,
                            vcpuid, vcpu->cpumask,
                            vm->def->cputune.period,
                            vm->def->cputune.quota,
                            &vcpu->sched) < 0)
}

Kernel
Once the cgroup is configured, the kernel has to migrate the process's VMAs according to the configured value and allocate pages for the process on the specified nodes. This is mainly implemented in the update_tasks_nodemask function.

update_tasks_nodemask walks every task_struct in the current cgroup and, for each one:

1. Modifies task_struct->mems_allowed.
2. Walks all of the task's VMAs and modifies vma->vm_policy. This ability of the cgroup code to rewrite vma->vm_policy was only added in 4.x kernels; 3.x kernels do not have it.
3. Decides whether to migrate: if migration is enabled, every page that is not on the configured nodes is migrated to the specified nodes.
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
    struct mm_struct *mm;
    bool migrate;

    cpuset_change_task_nodemask(task, &newmems); // set task_struct->mems_allowed to the cpuset.mems value

    mm = get_task_mm(task);
    if (!mm)
        continue;

    migrate = is_memory_migrate(cs); // check whether migration is allowed

    mpol_rebind_mm(mm, &cs->mems_allowed); // walk all of the task's vmas and set each vma->vm_policy to the cpuset.mems value
    if (migrate)
        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); // if migration is allowed, migrate pages that are not on the cpuset.mems nodes to the specified nodes
    else
        mmput(mm);
}

How does migrate = is_memory_migrate(cs) decide whether migration is allowed? It is controlled by cpuset.memory_migrate; as observed here, its value is 1 (true) by default, which means pages will be migrated.

[root@wl.localhost.com ~]# cat /cgroup/cpuset/libvirt/qemu-29-instance-535b3c33-49e1-4f01-9192-18af59d49af8/emulator/cpuset.memory_migrate
1

numatune memnode
qemu calls the mbind function to bind the virtual machine's memory to the host NUMA nodes.
qemu
If numatune is configured, the qemu command line gains host-nodes=0,policy=bind:

-object memory-backend-file,id=ram-node0,prealloc=yes,mem-path=/dev/hugepages/libvirt/qemu/17-centos,share=yes,size=2147483648,host-nodes=0,policy=bind

qemu iterates over all the objects:

qemu_opts_foreach(qemu_find_opts("object"),
                  user_creatable_add_opts_foreach,
                  object_create_delayed, NULL))

host_memory_backend_memory_complete
This function does three main things:

1. alloc: allocate the memory (open the hugepage file, set the file size, initialize the VMA).
2. mbind: set the NUMA memory access policy, with flags = MPOL_MF_STRICT | MPOL_MF_MOVE; MPOL_MF_MOVE means pages that are not on the specified nodes are moved onto them.
3. prealloc: preallocate the memory.
static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    Error *local_err = NULL;
    void *ptr;
    uint64_t sz;

    if (bc->alloc) {
        bc->alloc(backend, &local_err);
        if (local_err) {
            goto out;
        }
        ..........
        ptr = memory_region_get_ram_ptr(&backend->mr);
        sz = memory_region_size(&backend->mr);

        /* ensure policy won't be ignored in case memory is preallocated
         * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
         * this doesn't catch hugepage case. */
        unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

        if (mbind(ptr, sz, backend->policy,
                  maxnode ? backend->host_nodes : NULL, maxnode + 1, flags)) {
            if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
                error_setg_errno(errp, errno,
                                 "cannot bind memory to host NUMA nodes");
                return;
            }
        }

        /* Preallocate memory after the NUMA policy has been instantiated.
         * This is necessary to guarantee memory is allocated with
         * specified NUMA policy in place. */
        if (backend->prealloc) {
            os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
                            smp_cpus, &local_err);
            if (local_err) {
                goto out;
            }
        }
    }
}
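At the syscall level, the binding qemu performs above boils down to an mbind(2) call on the backend's mapping. A minimal standalone sketch of that call (not qemu code; the mapping size and node mask are made up for illustration, and <numaif.h> requires linking with -lnuma):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <numaif.h>   /* mbind(), MPOL_BIND, MPOL_MF_*; link with -lnuma */

int main(void)
{
    size_t sz = 64UL << 20;                 /* 64 MiB, stand-in for the backend size */
    void *ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (ptr == MAP_FAILED)
        return 1;

    /* Bind the range to host node 0, the equivalent of host-nodes=0,policy=bind.
     * MPOL_MF_MOVE migrates already-allocated pages that sit on other nodes. */
    unsigned long nodemask = 1UL << 0;      /* one bit per node: node 0 only */
    unsigned long maxnode = 2;              /* number of meaningful bits in the mask */
    unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

    if (mbind(ptr, sz, MPOL_BIND, &nodemask, maxnode, flags)) {
        perror("mbind");
        return 1;
    }

    memset(ptr, 0, sz);                     /* touch the pages; they are allocated on node 0 */
    munmap(ptr, sz);
    return 0;
}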
Kernel

How mbind is implemented:

1. Create a new mempolicy (new).
2. Call mbind_range to set vma->vm_policy on every VMA covering [start, start + len).
3. Migrate the pages.
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
    struct mm_struct *mm = current->mm;
    struct mempolicy *new;
    LIST_HEAD(pagelist);

    len = (len + PAGE_SIZE - 1) & PAGE_MASK;
    end = start + len;

    new = mpol_new(mode, mode_flags, nmask);

    ret = queue_pages_range(mm, start, end, nmask,
                            flags | MPOL_MF_INVERT, &pagelist);

    err = mbind_range(mm, start, end, new);

    if (!err) {
        if (!list_empty(&pagelist)) {
            nr_failed = migrate_pages(&pagelist, new_page, NULL,
                                      start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
        }
    }
}

mbind_range
mbind_range sets vma->vm_policy on every VMA covering [start, start + len), using the policy that corresponds to the numatune setting.
for (; vma && vma->vm_start < end; prev = vma, vma = next) {next = vma->vm_next;vmstart = max(start, vma->vm_start);vmend = min(end, vma->vm_end);pgoff = vma->vm_pgoff +((vmstart - vma->vm_start) >> PAGE_SHIFT);prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,vma->anon_vma, vma->vm_file, pgoff,new_pol, vma->vm_userfaultfd_ctx);replace:err = vma_replace_policy(vma, new_pol); //遍歷vma,設置vma->vm_policyif (err)goto out;}alloc page
The kernel has many variants of the page-allocator entry points; whether they return a virtual address or a struct page pointer, they all end up calling a common interface: __alloc_pages_nodemask().
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
                       nodemask_t *nodemask)
{
    struct page *page;
    unsigned int alloc_flags = ALLOC_WMARK_LOW; // first try to allocate from the LOW watermark
    gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
    struct alloc_context ac = { };

    gfp_mask &= gfp_allowed_mask;
    alloc_mask = gfp_mask;
    if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask,
                             &ac, &alloc_mask, &alloc_flags))
        return NULL;

    /* First allocation attempt */
    page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
    if (likely(page))
        return page;

    page = __alloc_pages_slowpath(alloc_mask, order, &ac);

    return page;
}

prepare_alloc_pages initializes the alloc_context, working out the zone, zonelist, nodemask and migrate type used for the allocation.
When picking a node, the allocator first checks whether the VMA has a policy: the node set in vma->vm_policy is used preferentially; otherwise the task_struct's mems_allowed is used.
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
                                       int preferred_nid, nodemask_t *nodemask,
                                       struct alloc_context *ac, gfp_t *alloc_mask,
                                       unsigned int *alloc_flags)
{
    ac->high_zoneidx = gfp_zone(gfp_mask);
    ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    ac->nodemask = nodemask;
    ac->migratetype = gfpflags_to_migratetype(gfp_mask);

    if (cpusets_enabled()) { // when the cpuset functionality is enabled
        *alloc_mask |= __GFP_HARDWALL;
        if (!ac->nodemask) // if the vma supplied no nodemask, use the task_struct's mems_allowed
            ac->nodemask = &cpuset_current_mems_allowed;
        else
            *alloc_flags |= ALLOC_CPUSET;
    }

    fs_reclaim_acquire(gfp_mask);
    fs_reclaim_release(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

    if (should_fail_alloc_page(gfp_mask, order))
        return false;

    if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
        *alloc_flags |= ALLOC_CMA;

    return true;
}

Conclusion
memory ultimately acts through cpuset.mems in the cgroup; memnode acts through host-nodes= on the qemu command line and ultimately uses the mbind function.

memory is coarse-grained and applies to the whole process; memnode is fine-grained and sets vma->vm_policy only on the specific VMAs it targets.

When allocating a page, the node configured in vma->vm_policy is chosen first; if the VMA has no policy, task_struct->mems_allowed is used; if neither is set, the node of the current CPU is used.

With the cgroup, migration is on by default: when the cgroup is configured, pages that are not on the specified nodes are migrated; mbind also migrates them because MPOL_MF_MOVE is set.
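One way to see the per-VMA effect described above from user space is get_mempolicy(2) with MPOL_F_ADDR, which reports the policy of the VMA containing a given address. A minimal sketch, not from the original post, reusing the mmap-and-mbind pattern shown earlier:

#include <stdio.h>
#include <sys/mman.h>
#include <numaif.h>   /* get_mempolicy(), mbind(), MPOL_*; link with -lnuma */

int main(void)
{
    size_t sz = 2UL << 20;
    void *ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (ptr == MAP_FAILED)
        return 1;

    /* Bind this VMA to node 0, as memnode does for the guest's memory backend. */
    unsigned long nodemask = 1UL << 0;
    if (mbind(ptr, sz, MPOL_BIND, &nodemask, 2, MPOL_MF_STRICT | MPOL_MF_MOVE)) {
        perror("mbind");
        return 1;
    }

    /* MPOL_F_ADDR asks for the policy of the VMA that contains ptr,
     * i.e. the vma->vm_policy installed by mbind_range. */
    int mode = -1;
    unsigned long mask = 0;
    if (get_mempolicy(&mode, &mask, 8 * sizeof(mask), ptr, MPOL_F_ADDR)) {
        perror("get_mempolicy");
        return 1;
    }
    printf("vma policy mode=%d (MPOL_BIND=%d), nodemask=0x%lx\n",
           mode, MPOL_BIND, mask);

    munmap(ptr, sz);
    return 0;
}

The same per-mapping policy also shows up in /proc/<pid>/numa_maps.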
References
https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/virtualization_tuning_and_optimization_guide/sect-virtualization_tuning_optimization_guide-numa-numa_and_libvirt