cgroup代码浅析(2)
info
数据结构
Cgroup和Task的关联
task->css_set
struct task_struct {struct css_set __rcu *cgroups; // 每个进程中,都对应有一个css_set结构体,css_set其实就是cgroup_subsys_state对象的集合,而每个cgroup_subsys_state代表一个subsystem... }struct css_set {struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];... }css_set的初始化发生在kernel boot,从如下代码可见
asmlinkage __visible void __init start_kernel(void) {cpuset_init();cgroup_init();... }一个task可以属于多个cgroup,一个cgroup也可以拥有多个task,这种M:N的关系,linux kernel中是通过cgrp_cset_link结构体表示的:
/** A cgroup can be associated with multiple css_sets as different tasks may* belong to different cgroups on different hierarchies. In the other* direction, a css_set is naturally associated with multiple cgroups.* This M:N relationship is represented by the following link structure* which exists for each association and allows traversing the associations* from both sides.*/ struct cgrp_cset_link {/* the cgroup and css_set this link associates */struct cgroup *cgrp;struct css_set *cset;/* list of cgrp_cset_links anchored at cgrp->cset_links */struct list_head cset_link;/* list of cgrp_cset_links anchored at css_set->cgrp_links */struct list_head cgrp_link; };这个结构其实就是一个link,cgrp就是这个link关联的cgroup,cset属于一个task,于是可以代表一个进程。
而cset_link是给struct cgroup查找struct cgrp_cset_link用的。那么怎么找呢?
我们首先来看如何把一个cgroup与一个css_set关联起来
/*** link_css_set - a helper function to link a css_set to a cgroup* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()* @cset: the css_set to be linked* @cgrp: the destination cgroup*//* link_css_set函数的功能就是把一个css_set与一个cgroup通过struct cgrp_cset_link联系起来。 */ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp) {struct cgrp_cset_link *link;BUG_ON(list_empty(tmp_links));if (cgroup_on_dfl(cgrp))cset->dfl_cgrp = cgrp;// 从已经分配好的一个cgrp_cset_link链表(表头为tmp_links)中拿一个出来,填上cgroup与css_set的指针link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);link->cset = cset;link->cgrp = cgrp;// 把这个cgrp_cset_link从原来的链表中移出来,加入到cgrp(这个就是那个cgroup)的cset_links链表中list_move_tail(&link->cset_link, &cgrp->cset_links);// 把cgrp_cset_link的cgrp_link加入到cset的cgrp_links链表中list_add_tail(&link->cgrp_link, &cset->cgrp_links);if (cgroup_parent(cgrp))cgroup_get(cgrp); }上面注释中提到,用于分配cgrp_cset_link(表头为tmp_links)的函数是allocate_cgrp_cset_links,其定义如下:
/*** allocate_cgrp_cset_links - allocate cgrp_cset_links* @count: the number of links to allocate* @tmp_links: list_head the allocated links are put on** Allocate @count cgrp_cset_link structures and chain them on @tmp_links* through ->cset_link. Returns 0 on success or -errno.*/ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) {struct cgrp_cset_link *link;int i;INIT_LIST_HEAD(tmp_links);for (i = 0; i < count; i++) {link = kzalloc(sizeof(*link), GFP_KERNEL);if (!link) {free_cgrp_cset_links(tmp_links);return -ENOMEM;}list_add(&link->cset_link, tmp_links);}return 0; }这个函数很简单,就是申请count个struct cgrp_cset_link,同时把它们一个个加到tmp_links这个链表里。这count个数据结构是通过struct cgrp_cset_link->cset_link连接起来的,但是前面说到这个变量是给struct cgroup用的。这是因为目前分配出来的这些个数据结构只是临时的,也就是说暂时借用一下这个变量,到后面会再来恢复这个变量的本来用途。这也是为什么link_css_set函数中cgrp_link成员用list_add_tail,而cset_link用list_move_tail。
于是,可以用下图来表示allocate_cgrp_cset_links的结果:
而link_css_set的结果则可以用下图来表示:
这张图也解释了linux代码中如何表现cgroup与subsystem之间多对多的关系。每个struct cgroup可以通过cgroup->cset_links和cgrp_cset_link->cset_link找到一串struct cgrp_cset_link,每个struct cgrp_cset_link都有着对应的css_set,这个css_set属于一个task_struct(其实是多个),其中包含着subsystem。
于是通过遍历链表就能找到这个cgroup对应的所有task(其实找到的是css_set,但是对于Cgroups这个模块来说,关心的并不是task_struct,而是这个css_set)。反之亦然,通过task_struct的cgroups变量(类型为struct css_set*)就能找到这个进程属于的所有cgroup。
例如,给定一个task,我们想找到这个task在某个hierarchy中的cgroup,就可以调用如下函数:linux-4.4.19/kernel/cgroup.c
/** Return the cgroup for "task" from the given hierarchy. Must be* called with cgroup_mutex and css_set_lock held.*/ static struct cgroup *task_cgroup_from_root(struct task_struct *task,struct cgroup_root *root) {/** No need to lock the task - since we hold cgroup_mutex the* task can't change groups, so the only thing that can happen* is that it exits and its css is set back to init_css_set.*/return cset_cgroup_from_root(task_css_set(task), root); } /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,struct cgroup_root *root) {struct cgroup *res = NULL;lockdep_assert_held(&cgroup_mutex);lockdep_assert_held(&css_set_lock);if (cset == &init_css_set) {res = &root->cgrp;} else {struct cgrp_cset_link *link;list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {struct cgroup *c = link->cgrp;if (c->root == root) {res = c;break;}}}BUG_ON(!res);return res; }Cgroup与subsystem
linux-4.4.19/include/linux/cgroup_subsys.h中定义了所有的subsystem。
可以看到,共有cpuset, debug, cpu, cpuacct, memory, devices, freezer, net_cls, blkio, perf_event, net_prio, hugetlb等12个.
cpu subsystem
struct task_group就是cpu subsystem对应的子类, 代码见
/* task group related information */ struct task_group {struct cgroup_subsys_state css;#ifdef CONFIG_FAIR_GROUP_SCHED/* schedulable entities of this group on each cpu */struct sched_entity **se;/* runqueue "owned" by this group on each cpu */struct cfs_rq **cfs_rq;unsigned long shares;#ifdef CONFIG_SMPatomic_long_t load_avg; #endif #endif#ifdef CONFIG_RT_GROUP_SCHEDstruct sched_rt_entity **rt_se;struct rt_rq **rt_rq;struct rt_bandwidth rt_bandwidth; #endifstruct rcu_head rcu;struct list_head list;struct task_group *parent;struct list_head siblings;struct list_head children;#ifdef CONFIG_SCHED_AUTOGROUPstruct autogroup *autogroup; #endifstruct cfs_bandwidth cfs_bandwidth; };Cgroups通过VFS来和用户打交道, 用户通过将各个subsystem mount到某个目录下之后, cgroup文件系统会自动创建一系列虚拟文件, 用户通过向不同的文件读写数据控制Cgroups的行为. 具体对CPU subsystem来说, 有一个tasks文件, 向其中写入一些进程的pid, 就能将这些进程加入到这个cgroup. 另外还有个cpu.shares的文件, 向其中写入一个数字后就能设置这个cgroup的进程的weight.
每个文件系统(包括Cgroups对应的cgroup文件系统)拥有一个数据结构, 其中有一系列函数指针, 当对这个文件系统进行读写操作时, 内核会调用这个文件系统的对应函数指针. 因此当向一个VFS的文件写入数据时, 可以在这个函数指针指向的函数做一些其他事情. 具体对于CPU subsystem, 当向cpu.shares写入一个数字时, 内核执行的函数干的事情是修改这个cgroup对应的struct task_group中的shares变量. 这个函数是:
linux-4.4.19/kernel/sched/core.c #8270
其中, css_tg函数是找到具体的subsystem子类, 这里就是struct task_group. sched_group_set_shares这个函数的定义如下:
int sched_group_set_shares(struct task_group *tg, unsigned long shares) {int i;unsigned long flags;/** We can't change the weight of the root cgroup.*/if (!tg->se[0])return -EINVAL;shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));mutex_lock(&shares_mutex);if (tg->shares == shares)goto done;tg->shares = shares;for_each_possible_cpu(i) {struct rq *rq = cpu_rq(i);struct sched_entity *se;se = tg->se[i];/* Propagate contribution to hierarchy */raw_spin_lock_irqsave(&rq->lock, flags);/* Possible calls to update_curr() need rq clock */update_rq_clock(rq);for_each_sched_entity(se)update_cfs_shares(group_cfs_rq(se));raw_spin_unlock_irqrestore(&rq->lock, flags);}done:mutex_unlock(&shares_mutex);return 0; }变量
根组:
extern struct mem_cgroup *root_mem_cgroup;函数
從page獲取mem_cgroup: page_mem_cgroup()
static inline struct mem_cgroup *page_mem_cgroup(struct page *page) {return page->mem_cgroup; }从pgdat + memcg 获取lru: mem_cgroup_lruvec()
static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,struct mem_cgroup *memcg) {struct mem_cgroup_per_node *mz;struct lruvec *lruvec;// 如果没有开启memcg,则,lru等于node上的lruif (mem_cgroup_disabled()) {lruvec = node_lruvec(pgdat);goto out;}// 获取memcg里对应的node的mz,mz里保存了这个memcg在这个node上的lruvecmz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);lruvec = &mz->lruvec; out:/** Since a node can be onlined after the mem_cgroup was created,* we have to be prepared to initialize lruvec->pgdat here;* and if offlined then reonlined, we need to reinitialize it.*/if (unlikely(lruvec->pgdat != pgdat))lruvec->pgdat = pgdat;return lruvec; }例子:
static void reclaim_pages_from_memcg(struct mem_cgroup *memcg) {pg_data_t *pgdat;struct lruvec *lruvec;pgdat = NODE_DATA(nid);lruvec = mem_cgroup_lruvec(pgdat, memcg); }常见函数
mem_cgroup_disabled()打印相关:
memcg_stat_show()charge 相关:
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,gfp_t gfp_mask, struct mem_cgroup **memcgp,bool compound); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,bool lrucare, bool compound); void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,bool compound); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list);charge/uncharge
mem_cgroup_uncharge
void mem_cgroup_uncharge(struct page *page) {if (mem_cgroup_disabled())return;/* Don't touch page->lru of any random page, pre-check: */if (!page->mem_cgroup)return;INIT_LIST_HEAD(&page->lru);uncharge_list(&page->lru); }memcg_stat_show
static int memcg_stat_show(struct seq_file *m, void *v) {struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));unsigned long memory, memsw;struct mem_cgroup *mi;unsigned int i;struct accumulated_stats acc;BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=MEM_CGROUP_STAT_NSTATS);BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=MEM_CGROUP_EVENTS_NSTATS);BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())continue;seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);}for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],mem_cgroup_read_events(memcg, i));for (i = 0; i < NR_LRU_LISTS; i++)seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);Refs
https://www.cnblogs.com/yjf512/p/6003094.html
https://blog.csdn.net/WaltonWang/article/details/53899191
转载于:https://www.cnblogs.com/muahao/p/10281139.html
总结
以上是生活随笔为你收集整理的cgroup代码浅析(2)的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: Java程序员从阿里、京东、美团面试回来
- 下一篇: 落地华东总部、上线创新云、签约AIoT产