當前位置：首頁 >

Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】

發布時間：2025/3/18 49 豆豆

生活随笔收集整理的這篇文章主要介紹了 Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

轉自：https://blog.csdn.net/21cnbao/article/details/7309757

在我們使用ARM等嵌入式Linux系統的時候，一個頭疼的問題是GPU，Camera，HDMI等都需要預留大量連續內存，這部分內存平時不用，但是一般的做法又必須先預留著。目前，Marek Szyprowski和Michal Nazarewicz實現了一套全新的Contiguous Memory Allocator。通過這套機制，我們可以做到不預留內存，這些內存平時是可用的，只有當需要的時候才被分配給Camera，HDMI等設備。下面分析它的基本代碼流程。

聲明連續內存

內核啟動過程中arch/arm/mm/init.c中的arm_memblock_init()會調用dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

該函數位于：drivers/base/dma-contiguous.c

/**

* dma_contiguous_reserve() - reserve area for contiguous memory handling

* @limit: End address of the reserved memory (optional, 0 for any).

* This function reserves memory from early allocator. It should be

* called by arch specific code once the early allocator (memblock or bootmem)

* has been activated and all other subsystems have already allocated/reserved

* memory.

void __init dma_contiguous_reserve(phys_addr_t limit)

{

unsigned long selected_size = 0;

pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);

if (size_cmdline != -1) {

selected_size = size_cmdline;

} else {

#ifdef CONFIG_CMA_SIZE_SEL_MBYTES

selected_size = size_bytes;

#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)

selected_size = cma_early_percent_memory();

#elif defined(CONFIG_CMA_SIZE_SEL_MIN)

selected_size = min(size_bytes, cma_early_percent_memory());

#elif defined(CONFIG_CMA_SIZE_SEL_MAX)

selected_size = max(size_bytes, cma_early_percent_memory());

#endif

}

if (selected_size) {

pr_debug("%s: reserving %ld MiB for global area\n", __func__,

selected_size / SZ_1M);

dma_declare_contiguous(NULL, selected_size, 0, limit);

}

};

其中的size_bytes定義為：

static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;

默認情況下，CMA_SIZE_MBYTES會被定義為16（即size_bytes為16MB），來源于CONFIG_CMA_SIZE_MBYTES=16。

/**
 * dma_declare_contiguous() - reserve a region for contiguous allocations
 * @dev:   Device the reserved region is recorded for.
 * @size:  Size of the region in bytes.
 * @base:  Fixed start address to reserve, or 0 to let memblock pick one.
 * @limit: Upper bound handed to __memblock_alloc_base() when @base is 0.
 *
 * Excerpt: "..." marks code the article leaves out (the declarations of
 * r and alignment are among the omitted lines).
 *
 * Returns 0 on success.  On failure the negative errno that was stored in
 * @base (-EBUSY, -ENOMEM or -EINVAL on the paths shown) is returned.
 */
int __init dma_declare_contiguous(struct device *dev, unsigned long size,
				  phys_addr_t base, phys_addr_t limit)
{
	...

	/* Reserve memory */
	if (base) {
		if (memblock_is_region_reserved(base, size) ||
		    memblock_reserve(base, size) < 0) {
			base = -EBUSY;
			goto err;
		}
	} else {
		/*
		 * Use __memblock_alloc_base() since
		 * memblock_alloc_base() panic()s.
		 */
		phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);

		if (!addr) {
			base = -ENOMEM;
			goto err;
		} else if (addr + size > ~(unsigned long)0) {
			/* Region end does not fit in unsigned long: give it back. */
			memblock_free(addr, size);
			base = -EINVAL;
			goto err;
		} else {
			base = addr;
		}
	}

	/*
	 * Each reserved area must be initialised later, when more kernel
	 * subsystems (like slab allocator) are available.
	 */
	r->start = base;
	r->size = size;
	r->dev = dev;
	cma_reserved_count++;
	pr_info("CMA: reserved %lu MiB at %08lx\n", size / SZ_1M,
		(unsigned long)base);

	/* Architecture specific contiguous memory fixup. */
	dma_contiguous_early_fixup(base, size);
	return 0;

err:
	pr_err("CMA: failed to reserve %lu MiB\n", size / SZ_1M);
	return base;
}

由此可見，連續內存區域也是在內核啟動的早期，通過__memblock_alloc_base()拿到的。

另外：

drivers/base/dma-contiguous.c里面的core_initcall()會導致cma_init_reserved_areas()被調用：

/*
 * Create a CMA area for every region recorded in cma_reserved[] and attach
 * it to the device stored with that region.  A region whose area creation
 * returns an error pointer is skipped.  Always returns 0.
 */
static int __init cma_init_reserved_areas(void)
{
	struct cma_reserved *region;
	struct cma_reserved *stop = cma_reserved + cma_reserved_count;

	pr_debug("%s()\n", __func__);

	for (region = cma_reserved; region != stop; region++) {
		struct cma *area;

		area = cma_create_area(PFN_DOWN(region->start),
				       region->size >> PAGE_SHIFT);
		if (IS_ERR(area))
			continue;

		dev_set_cma_area(region->dev, area);
	}

	return 0;
}
core_initcall(cma_init_reserved_areas);

cma_create_area()會調用cma_activate_area(),cma_activate_area()函數則會針對每個page調用：

init_cma_reserved_pageblock(pfn_to_page(base_pfn));

這個函數則會通過set_pageblock_migratetype(page, MIGRATE_CMA)將頁設置為MIGRATE_CMA類型的：

#ifdef CONFIG_CMA
/*
 * Hand one whole pageblock over to the page allocator as MIGRATE_CMA:
 * clear the reserved flag and zero the count of each page, mark the block
 * MIGRATE_CMA, free it as a single pageblock_order allocation and account
 * the pages in totalram_pages.
 */
void __init init_cma_reserved_pageblock(struct page *page)
{
	struct page *cur = page;
	unsigned remaining = pageblock_nr_pages;

	/* The body runs before the counter is tested, like a do/while. */
	for (;;) {
		__ClearPageReserved(cur);
		set_page_count(cur, 0);
		cur++;
		if (--remaining == 0)
			break;
	}

	/* Only the head page gets a reference before the block is freed. */
	set_page_refcounted(page);
	set_pageblock_migratetype(page, MIGRATE_CMA);
	__free_pages(page, pageblock_order);

	totalram_pages += pageblock_nr_pages;
}
#endif

同時其中調用的__free_pages(page, pageblock_order);最終會調用到__free_one_page(page, zone, order, migratetype);
相關的page會被加到MIGRATE_CMA的free_list上面去：

list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);

申請連續內存

申請連續內存仍然使用標準的arch/arm/mm/dma-mapping.c中定義的dma_alloc_coherent()和dma_alloc_writecombine()，這二者會間接調用drivers/base/dma-contiguous.c中的

struct page *dma_alloc_from_contiguous(struct device *dev, int count,

unsigned int align)

/*
 * Excerpt of dma_alloc_from_contiguous(): "..." marks code the article
 * leaves out.  The visible loop searches the CMA bitmap for a free window
 * of count pages and tries to claim it with alloc_contig_range(); a window
 * that reports -EBUSY is skipped and the search continues further on.
 */
struct page *dma_alloc_from_contiguous(struct device *dev, int count,
				       unsigned int align)
{
	...

	while (1) {
		pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
						    start, count, mask);
		/* No free window left in the bitmap. */
		if (pageno >= cma->count) {
			ret = -ENOMEM;
			goto error;
		}

		pfn = cma->base_pfn + pageno;
		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
		if (!ret) {
			/* Claimed: record the pages as used and stop searching. */
			bitmap_set(cma->bitmap, pageno, count);
			break;
		}
		if (ret != -EBUSY)
			goto error;

		pr_debug("%s(): memory range at %p is busy, retrying\n",
			 __func__, pfn_to_page(pfn));

		/* try again with a bit different memory target */
		start = pageno + mask + 1;
	}

	...
}

int alloc_contig_range(unsigned long start, unsigned long end,
                       unsigned migratetype)

需要隔離page，隔離page的作用通過代碼的注釋可以體現：

/*
 * What we do here is we mark all pageblocks in range as
 * MIGRATE_ISOLATE.  Because of the way page allocator works, we
 * align the range to MAX_ORDER pages so that page allocator
 * won't try to merge buddies from different pageblocks and
 * change MIGRATE_ISOLATE to some other migration type.
 *
 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
 * migrate the pages from an unaligned range (ie. pages that
 * we are interested in).  This will put all the pages in
 * range back to page allocator as MIGRATE_ISOLATE.
 *
 * When this is done, we take the pages in range from page
 * allocator removing them from the buddy system.  This way
 * page allocator will never consider using them.
 *
 * This lets us mark the pageblocks back as
 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
 * MAX_ORDER aligned range but not in the unaligned, original
 * range are put back to page allocator so that buddy can use
 * them.
 */

ret = start_isolate_page_range(pfn_align_to_maxpage_down(start),

pfn_align_to_maxpage_up(end),

migratetype);

簡單地說，就是把相關的page標記為MIGRATE_ISOLATE，這樣buddy系統就不會再使用他們。

* start_isolate_page_range() -- make page-allocation-type of range of pages

* to be MIGRATE_ISOLATE.

* @start_pfn: The lower PFN of the range to be isolated.

* @end_pfn: The upper PFN of the range to be isolated.

* @migratetype: migrate type to set in error recovery.

* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in

* the range will never be allocated. Any free pages and pages freed in the

* future will not be allocated again.

* start_pfn/end_pfn must be aligned to pageblock_order.

* Returns 0 on success and -EBUSY if any part of range cannot be isolated.

int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,

unsigned migratetype)

{

unsigned long pfn;

unsigned long undo_pfn;

struct page *page;

BUG_ON((start_pfn) & (pageblock_nr_pages - 1));

BUG_ON((end_pfn) & (pageblock_nr_pages - 1));

for (pfn = start_pfn;

pfn < end_pfn;

pfn += pageblock_nr_pages) {

page = __first_valid_page(pfn, pageblock_nr_pages);

if (page && set_migratetype_isolate(page)) {

undo_pfn = pfn;

goto undo;

}

return 0;

undo:

for (pfn = start_pfn;

pfn < undo_pfn;

pfn += pageblock_nr_pages)

unset_migratetype_isolate(pfn_to_page(pfn), migratetype);

return -EBUSY;

}

接下來調用__alloc_contig_migrate_range()進行頁面隔離和遷移:

/*
 * Isolate the pages in [start, end) batch by batch and migrate each batch
 * with migrate_pages().  A batch that is still not empty after repeated
 * migrate_pages() calls ends the loop with -EBUSY (or the earlier negative
 * result); a pending fatal signal or a zero return from
 * isolate_migratepages_range() ends it with -EINTR.  Pages left on the list
 * are put back on the LRU.  A positive result is reported as 0.
 */
static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
{
	/* This function is based on compact_zone() from compaction.c. */
	unsigned long cursor = start;
	unsigned int attempts = 0;
	int status = 0;
	struct compact_control cc = {
		.nr_migratepages = 0,
		.order = -1,
		.zone = page_zone(pfn_to_page(start)),
		.sync = true,
	};

	INIT_LIST_HEAD(&cc.migratepages);

	migrate_prep_local();

	for (;;) {
		/* Done once the range is consumed and no batch is pending. */
		if (cursor >= end && list_empty(&cc.migratepages))
			break;

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		if (!list_empty(&cc.migratepages)) {
			/* Same batch again: give up on the fifth retry. */
			if (++attempts == 5) {
				if (status >= 0)
					status = -EBUSY;
				break;
			}
		} else {
			cc.nr_migratepages = 0;
			cursor = isolate_migratepages_range(cc.zone, &cc,
							    cursor, end);
			if (cursor == 0) {
				status = -EINTR;
				break;
			}
			attempts = 0;
		}

		status = migrate_pages(&cc.migratepages,
				       __alloc_contig_migrate_alloc,
				       0, false, true);
	}

	putback_lru_pages(&cc.migratepages);

	if (status > 0)
		return 0;
	return status;
}

其中的函數migrate_pages()會完成頁面的遷移，遷移過程中通過傳入的__alloc_contig_migrate_alloc()申請新的page，并將老的page賦給新的page：

/*
 * Excerpt of migrate_pages(): "..." marks code the article leaves out
 * (including the out: label and the return path).
 *
 * Makes up to 10 passes over the list @from for as long as the previous
 * pass saw an -EAGAIN, calling unmap_and_move() on every page.  -ENOMEM
 * aborts immediately, -EAGAIN schedules another pass, 0 is success and any
 * other value is counted in nr_failed as a permanent failure.
 */
int migrate_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private, bool offlining,
		bool sync)
{
	int retry = 1;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

	for(pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, page2, from, lru) {
			cond_resched();

			/* The force argument (pass > 2) is only set from the fourth pass on. */
			rc = unmap_and_move(get_new_page, private,
						page, pass > 2, offlining,
						sync);

			switch(rc) {
			case -ENOMEM:
				goto out;
			case -EAGAIN:
				retry++;
				break;
			case 0:
				break;
			default:
				/* Permanent failure */
				nr_failed++;
				break;
			}
		}
	}
	rc = 0;

	...
}

其中的unmap_and_move()函數較為關鍵，它定義在mm/migrate.c中

* Obtain the lock on page, remove all ptes and migrate the page

* to the newly allocated page in newpage.

static int unmap_and_move(new_page_t get_new_page, unsigned long private,

struct page *page, int force, bool offlining, bool sync)

{

int rc = 0;

int *result = NULL;

struct page *newpage = get_new_page(page, private, &result);

int remap_swapcache = 1;

int charge = 0;

struct mem_cgroup *mem = NULL;

struct anon_vma *anon_vma = NULL;

...

/* charge against new page */

charge = mem_cgroup_prepare_migration(page, newpage, &mem);

...

if (PageWriteback(page)) {

if (!force || !sync)

goto uncharge;

wait_on_page_writeback(page);

}

* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,

* we cannot notice that anon_vma is freed while we migrates a page.

* This get_anon_vma() delays freeing anon_vma pointer until the end

* of migration. File cache pages are no problem because of page_lock()

* File Caches may use write_page() or lock_page() in migration, then,

* just care Anon page here.

if (PageAnon(page)) {

* Only page_lock_anon_vma() understands the subtleties of

* getting a hold on an anon_vma from outside one of its mms.

anon_vma = page_lock_anon_vma(page);

if (anon_vma) {

* Take a reference count on the anon_vma if the

* page is mapped so that it is guaranteed to

* exist when the page is remapped later

get_anon_vma(anon_vma);

page_unlock_anon_vma(anon_vma);

} else if (PageSwapCache(page)) {

* We cannot be sure that the anon_vma of an unmapped

* swapcache page is safe to use because we don't

* know in advance if the VMA that this page belonged

* to still exists. If the VMA and others sharing the

* data have been freed, then the anon_vma could

* already be invalid.

* To avoid this possibility, swapcache pages get

* migrated but are not remapped when migration

* completes

remap_swapcache = 0;

} else {

goto uncharge;

}

...

/* Establish migration ptes or remove ptes */

try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

skip_unmap:

if (!page_mapped(page))

rc = move_to_new_page(newpage, page, remap_swapcache);

if (rc && remap_swapcache)

remove_migration_ptes(page, page);

/* Drop an anon_vma reference if we took one */

if (anon_vma)

drop_anon_vma(anon_vma);

uncharge:

if (!charge)

mem_cgroup_end_migration(mem, page, newpage, rc == 0);

unlock:

unlock_page(page);

move_newpage:

...

}

通過unmap_and_move()，老的page就被遷移過去新的page。

接下來要回收page，回收page的作用是，不至于因為拿了連續的內存后，系統變得內存饑餓：

/*
 * Reclaim enough pages to make sure that contiguous allocation
 * will not starve the system.
 */

__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);

* Trigger memory pressure bump to reclaim some pages in order to be able to

* allocate 'count' pages in single page units. Does similar work as

*__alloc_pages_slowpath() function.

static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)

{

enum zone_type high_zoneidx = gfp_zone(gfp_mask);

struct zonelist *zonelist = node_zonelist(0, gfp_mask);

int did_some_progress = 0;

int order = 1;

unsigned long watermark;

* Increase level of watermarks to force kswapd do his job

* to stabilise at new watermark level.

__update_cma_watermarks(zone, count);

/* Obey watermarks as if the page was being allocated */

watermark = low_wmark_pages(zone) + count;

while (!zone_watermark_ok(zone, 0, watermark, 0, 0)) {

wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));

did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,

NULL);

if (!did_some_progress) {

/* Exhausted what can be done so it's blamo time */

out_of_memory(zonelist, gfp_mask, order, NULL);

}

/* Restore original watermark levels. */

__update_cma_watermarks(zone, -count);

return count;

}

釋放連續內存

內存釋放的時候也比較簡單，直接就是：

arch/arm/mm/dma-mapping.c：

void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)

arch/arm/mm/dma-mapping.c:

/*
 * Remap the buffer with pgprot_kernel, then return its pages (size
 * converted from bytes to a page count) through
 * dma_release_from_contiguous().
 */
static void __free_from_contiguous(struct device *dev, struct page *page,
				   size_t size)
{
	const int nr_pages = size >> PAGE_SHIFT;

	__dma_remap(page, size, pgprot_kernel);
	dma_release_from_contiguous(dev, page, nr_pages);
}

/*
 * Excerpt of dma_release_from_contiguous(): "..." marks code the article
 * leaves out.  Presumably the omitted lines derive pfn from pages and
 * produce the bool result -- neither is visible here.
 */
bool dma_release_from_contiguous(struct device *dev, struct page *pages,

int count)

{

...

/* Hand the count pages starting at pfn to free_contig_range(). */
free_contig_range(pfn, count);

}

/*
 * Free nr_pages consecutive pages, starting at page frame pfn, one page
 * at a time with __free_page().
 */
void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
	while (nr_pages--) {
		__free_page(pfn_to_page(pfn));
		++pfn;
	}
}

將page交還給buddy。

內核內存分配的migratetype

內核內存分配的時候，帶的標志是GFP_，但是GFP_可以轉化為migratetype：

/*
 * Convert GFP allocation flags into a migrate type.  Warns when every bit
 * of GFP_MOVABLE_MASK is set.  With mobility grouping disabled the result
 * is always MIGRATE_UNMOVABLE; otherwise bit 1 of the result reflects
 * __GFP_MOVABLE and bit 0 reflects __GFP_RECLAIMABLE.
 */
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
	int movable_bit;
	int reclaimable_bit;

	WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* Group based on mobility */
	movable_bit = (gfp_flags & __GFP_MOVABLE) ? 1 : 0;
	reclaimable_bit = (gfp_flags & __GFP_RECLAIMABLE) ? 1 : 0;

	return (movable_bit << 1) | reclaimable_bit;
}

之后申請內存的時候，會對比遷移類型匹配的free_list：

page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,

zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,

preferred_zone, migratetype);

另外，筆者也編寫了一個測試程序，透過它隨時測試CMA的功能：

/*
 * kernel module helper for testing CMA
 *
 * Licensed under GPLv2 or later.
 */

#include <linux/module.h>

#include <linux/device.h>

#include <linux/fs.h>

#include <linux/miscdevice.h>

#include <linux/dma-mapping.h>

#define CMA_NUM 10

static struct device *cma_dev;

static dma_addr_t dma_phys[CMA_NUM];

static void *dma_virt[CMA_NUM];

/* any read request will free coherent memory, eg.

* cat /dev/cma_test

static ssize_t

cma_test_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)

{

int i;

for (i = 0; i < CMA_NUM; i++) {

if (dma_virt[i]) {

dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);

_dev_info(cma_dev, "free virt: %p phys: %p\n", dma_virt[i], (void *)dma_phys[i]);

dma_virt[i] = NULL;

break;

}

return 0;

}

* any write request will alloc coherent memory, eg.

* echo 0 > /dev/cma_test

static ssize_t

cma_test_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)

{

int i;

int ret;

for (i = 0; i < CMA_NUM; i++) {

if (!dma_virt[i]) {

dma_virt[i] = dma_alloc_coherent(cma_dev, (i + 1) * SZ_1M, &dma_phys[i], GFP_KERNEL);

if (dma_virt[i]) {

void *p;

/* touch every page in the allocated memory */

for (p = dma_virt[i]; p < dma_virt[i] + (i + 1) * SZ_1M; p += PAGE_SIZE)

*(u32 *)p = 0;

_dev_info(cma_dev, "alloc virt: %p phys: %p\n", dma_virt[i], (void *)dma_phys[i]);

} else {

dev_err(cma_dev, "no mem in CMA area\n");

ret = -ENOMEM;

}

break;

}

return count;

}

/*
 * File operations of the test device: a read is handled by
 * cma_test_read() and a write by cma_test_write().
 */
static const struct file_operations cma_test_fops = {

.owner = THIS_MODULE,

.read = cma_test_read,

.write = cma_test_write,

};

static struct miscdevice cma_test_misc = {

.name = "cma_test",

.fops = &cma_test_fops,

};

/*
 * Module entry point: register the misc device, keep its struct device in
 * cma_dev for the DMA calls and set every bit of its coherent DMA mask.
 * Returns 0 on success or the error reported by misc_register().
 */
static int __init cma_test_init(void)
{
	int err = misc_register(&cma_test_misc);

	if (unlikely(err)) {
		pr_err("failed to register cma test misc device!\n");
		return err;
	}

	cma_dev = cma_test_misc.this_device;
	cma_dev->coherent_dma_mask = ~0;

	_dev_info(cma_dev, "registered.\n");

	return 0;
}
module_init(cma_test_init);

/*
 * Module exit point: release every buffer still held in dma_virt[] so that
 * nothing leaks on unload, then unregister the misc device.  The buffers
 * are freed first because cma_dev is the misc device's struct device.
 */
static void __exit cma_test_exit(void)
{
	int i;

	for (i = 0; i < CMA_NUM; i++) {
		if (!dma_virt[i])
			continue;

		/* Slot i was allocated with (i + 1) MiB; free the same size. */
		dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
		dma_virt[i] = NULL;
	}

	misc_deregister(&cma_test_misc);
}
module_exit(cma_test_exit);

MODULE_LICENSE("GPL");

MODULE_AUTHOR("Barry Song <21cnbao@gmail.com>");

MODULE_DESCRIPTION("kernel module to help the test of CMA");

MODULE_ALIAS("CMA test");

申請內存：

# echo 0 > /dev/cma_test

釋放內存：

# cat /dev/cma_test

參考鏈接：

[1] http://www.spinics.net/lists/arm-kernel/msg160854.html

[2] http://www.spinics.net/lists/arm-kernel/msg162063.html

[3] http://lwn.net/Articles/447405/

轉載于:https://www.cnblogs.com/sky-heaven/p/9549482.html

總結

以上是生活随笔為你收集整理的Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： js的数据类型，以及如何判断它们是哪种类
下一篇： Linux 系统内存分析

日韩av黄I国产麻豆传媒I国产91av视频在线观看I日韩一区二区三区在线看I美女国产在线I麻豆视频国产在线观看I成人黄色短片

Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】

聲明連續內存

申請連續內存

釋放連續內存

內核內存分配的migratetype

總結