当前位置：服务器评测 > 教程资讯 > Linux运维 > 正文

Linux物理内存管理区初始化

2012-01-02 分类：Linux运维阅读(174) 评论(0)

Linux物理内存管理区在start_kernel函数中进行初始化，此时启动分配器已经建立，所以可以从bootmem中分配需要的内存。

一、全局变量初始化

max_pfn：最大物理页面帧号

start_kernel()->setup_arch()->e820_end_of_ram_pfn()找出最大可用内存页面帧号。

void __init setup_arch(char **cmdline_p)

{

……

/*

* partially used pages are not usable – thus

* we are rounding upwards:

*/

/*遍历e820.map，找到系统中的最大内存数，

这个内存数需小于4G*/

max_pfn = e820_end_of_ram_pfn();

……

}

/*
 * Report the highest page frame number that belongs to an E820_RAM
 * entry, using MAX_ARCH_PFN as the search limit.
 *
 * NOTE(review): the accompanying article says MAX_ARCH_PFN corresponds
 * to the 4 GiB address space on this configuration - confirm.
 */
unsigned long __init e820_end_of_ram_pfn(void)
{
	const unsigned long pfn_ceiling = MAX_ARCH_PFN;

	return e820_end_pfn(pfn_ceiling, E820_RAM);
}

/*
 * Find the highest page frame number we have available.
 *
 * @limit_pfn: upper bound (exclusive) on the page frame numbers searched;
 *             entries starting at or above it are ignored.
 * @type:      e820 entry type to consider (e.g. E820_RAM); entries of any
 *             other type are skipped.
 *
 * Returns the largest end PFN among the matching e820 entries, clamped
 * first to @limit_pfn and then to MAX_ARCH_PFN.  Returns 0 when no entry
 * matches.
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
	int i;
	unsigned long last_pfn = 0;
	unsigned long max_arch_pfn = MAX_ARCH_PFN;

	/* Walk every memory range recorded in the global e820 map. */
	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];
		unsigned long start_pfn;
		unsigned long end_pfn;

		/* Not the type of range we are looking for. */
		if (ei->type != type)
			continue;

		/*
		 * Convert the range's start and end physical addresses to
		 * page frame numbers; both round downwards.
		 */
		start_pfn = ei->addr >> PAGE_SHIFT;
		end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;

		/* Range lies entirely at or above the limit. */
		if (start_pfn >= limit_pfn)
			continue;

		/*
		 * Range straddles the limit: the limit itself is the answer
		 * and nothing later can be higher.
		 */
		if (end_pfn > limit_pfn) {
			last_pfn = limit_pfn;
			break;
		}

		/* Remember the highest end PFN seen so far. */
		if (end_pfn > last_pfn)
			last_pfn = end_pfn;
	}

	/* Never report more than the architecture can address. */
	if (last_pfn > max_arch_pfn)
		last_pfn = max_arch_pfn;

	printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
			 last_pfn, max_arch_pfn);

	return last_pfn;
}

max_low_pfn：低端内存最大页面数

start_kernel()->setup_arch()->find_low_pfn_range()

/*
 * Determine low and high memory ranges.
 *
 * Chooses between the two initialisation paths by comparing max_pfn with
 * MAXMEM_PFN: when all of RAM fits below MAXMEM_PFN only low memory is
 * set up, otherwise the high-memory path is taken.
 */
void __init find_low_pfn_range(void)
{
	/* it could update max_pfn */

	if (max_pfn > MAXMEM_PFN) {
		/* More RAM than MAXMEM_PFN: take the highmem path. */
		highmem_pfn_init();
		return;
	}

	/* Everything fits below MAXMEM_PFN: no highmem needed. */
	lowmem_pfn_init();
}

我们直接看具有高端地址空间的部分。

/*
 * We have more RAM than fits into lowmem - we try to put it into
 * highmem, also taking the highmem=x boot parameter into account.
 *
 * Sets max_low_pfn to MAXMEM_PFN, derives highmem_pages when it still
 * holds the sentinel -1, reconciles highmem_pages with max_pfn, and
 * finally trims max_pfn according to the CONFIG_HIGHMEM /
 * CONFIG_HIGHMEM64G build options.
 */
void __init highmem_pfn_init(void)
{
	/*
	 * Low memory ends at MAXMEM_PFN.
	 * NOTE(review): the original annotation states that MAXMEM_PFN is
	 * somewhat below the nominal 896 MiB because of reserved virtual
	 * areas - confirm against the MAXMEM definition.
	 */
	max_low_pfn = MAXMEM_PFN;

	/* -1 is the "not set" sentinel: use everything above lowmem. */
	if (highmem_pages == -1)
		highmem_pages = max_pfn - MAXMEM_PFN;

	/*
	 * highmem_pages may have been set explicitly and disagree with the
	 * detected memory size; handle both directions of mismatch.
	 */
	if (highmem_pages + MAXMEM_PFN < max_pfn)
		max_pfn = MAXMEM_PFN + highmem_pages;

	if (highmem_pages + MAXMEM_PFN > max_pfn) {
		printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
			pages_to_mb(max_pfn - MAXMEM_PFN),
			pages_to_mb(highmem_pages));
		highmem_pages = 0;
	}
#ifndef CONFIG_HIGHMEM
	/* Maximum memory usable is what is directly addressable */
	printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
	if (max_pfn > MAX_NONPAE_PFN)
		printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
	else
		printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
	max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
	/* Without HIGHMEM64G, memory above MAX_NONPAE_PFN is dropped. */
	if (max_pfn > MAX_NONPAE_PFN) {
		max_pfn = MAX_NONPAE_PFN;
		printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
	}
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
}

二、管理区初始化

start_kernel()->setup_arch()->paging_init()->zone_sizes_init()

/*
 * Fill in the highest page frame number of each zone and hand the table
 * to free_area_init_nodes(), which performs the actual zone setup.
 */
static void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	unsigned long dma_end_pfn;

	/* Zones not assigned below keep a maximum PFN of 0. */
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

	/* ZONE_DMA ends at the physical page behind MAX_DMA_ADDRESS. */
	dma_end_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
	max_zone_pfns[ZONE_DMA] = dma_end_pfn;

	/* ZONE_NORMAL ends where low memory ends. */
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;

#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif

	free_area_init_nodes(max_zone_pfns);
}

其中x86-32 非PAE下MAX_DMA_ADDRESS为16M+3G大小

/*
 * The maximum address that we can perform a DMA transfer to on this
 * platform: 0x1000000 bytes (16 MiB) above PAGE_OFFSET.  This is a
 * virtual address; zone_sizes_init() passes it through virt_to_phys()
 * before turning it into a page frame number.
 */
#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)

/**
 * free_area_init_nodes - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
	unsigned long nid;
	int i;

	/* Sort early_node_map as initialisation assumes it is sorted */
	sort_node_map();

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	/* Zone 0 starts at the lowest PFN found in the active regions. */
	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
	for (i = 1; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		/*
		 * Zones are assumed contiguous: each one starts where the
		 * previous one ended, and never ends before it starts.
		 */
		arch_zone_lowest_possible_pfn[i] =
			arch_zone_highest_possible_pfn[i-1];
		arch_zone_highest_possible_pfn[i] =
			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
	}

	/* ZONE_MOVABLE has no fixed architectural boundaries. */
	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes(zone_movable_pfn);

	/* Print out the zone ranges */
	printk("Zone PFN ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		printk(" %-8s %0#10lx -> %0#10lx\n",
				zone_names[i],
				arch_zone_lowest_possible_pfn[i],
				arch_zone_highest_possible_pfn[i]);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	printk("Movable zone start PFN for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
	}

	/* Print out the early_node_map[] */
	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
	for (i = 0; i < nr_nodemap_entries; i++)
		printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
						early_node_map[i].start_pfn,
						early_node_map[i].end_pfn);

	/* Initialise every node */
	mminit_verify_pageflags_layout();	/* debugging aid */
	setup_nr_node_ids();
	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		/*
		 * Set up the node's zone data.  NOTE(review): per the
		 * original annotation the zones' free lists are still empty
		 * at this point and pages arrive later in mem_init() -
		 * confirm.
		 */
		free_area_init_node(nid, NULL,
				find_min_pfn_for_node(nid), NULL);

		/* Any memory on that node */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_HIGH_MEMORY);
		check_for_regular_memory(pgdat);
	}
}

/*
 * Initialise the pg_data_t of one node and all of its zones.
 *
 * @nid:            node id; selects the pg_data_t via NODE_DATA().
 * @zones_size:     forwarded unchanged to calculate_node_totalpages()
 *                  and free_area_init_core(); may be NULL (as passed by
 *                  free_area_init_nodes()).
 * @node_start_pfn: first page frame number of the node.
 * @zholes_size:    forwarded like @zones_size; may be NULL.
 */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
		unsigned long node_start_pfn, unsigned long *zholes_size)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	pgdat->node_id = nid;
	pgdat->node_start_pfn = node_start_pfn;

	/* Presumably records the node's total page counts in pgdat. */
	calculate_node_totalpages(pgdat, zones_size, zholes_size);

	/*
	 * NOTE(review): per the original annotation this also publishes the
	 * node's map through a global when there is only one node - confirm
	 * against alloc_node_mem_map().
	 */
	alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
		nid, (unsigned long)pgdat,
		(unsigned long)pgdat->node_mem_map);
#endif

	/* Per-zone initialisation: locks, lists, counters, memmap. */
	free_area_init_core(pgdat, zones_size, zholes_size);
}

具体的区域的初始化在下面函数进行

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * @pgdat:       node whose zones are initialised.
 * @zones_size:  forwarded to zone_spanned_pages_in_node().
 * @zholes_size: forwarded to zone_absent_pages_in_node().
 *
 * Zones are laid out back to back starting at pgdat->node_start_pfn;
 * zones that span no pages get their bookkeeping reset but are otherwise
 * skipped.
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	enum zone_type j;
	int nid = pgdat->node_id;
	unsigned long zone_start_pfn = pgdat->node_start_pfn;
	int ret;

	pgdat_resize_init(pgdat);
	pgdat->nr_zones = 0;
	init_waitqueue_head(&pgdat->kswapd_wait);
	pgdat->kswapd_max_order = 0;
	pgdat_page_cgroup_init(pgdat);

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, realsize, memmap_pages;
		enum lru_list l;

		/* Pages spanned by the zone, and spanned minus absent. */
		size = zone_spanned_pages_in_node(nid, j, zones_size);
		realsize = size - zone_absent_pages_in_node(nid, j,
								zholes_size);

		/*
		 * Adjust realsize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
		memmap_pages =
			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
		if (realsize >= memmap_pages) {
			realsize -= memmap_pages;
			if (memmap_pages)
				printk(KERN_DEBUG
				       " %s zone: %lu pages used for memmap\n",
				       zone_names[j], memmap_pages);
		} else
			printk(KERN_WARNING
				" %s zone: %lu pages exceeds realsize %lu\n",
				zone_names[j], memmap_pages, realsize);

		/* Account for reserved pages */
		if (j == 0 && realsize > dma_reserve) {
			realsize -= dma_reserve;
			printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
					zone_names[0], dma_reserve);
		}

		/* Highmem zones do not count towards nr_kernel_pages. */
		if (!is_highmem_idx(j))
			nr_kernel_pages += realsize;
		nr_all_pages += realsize;

		/* Fill in the zone descriptor. */
		zone->spanned_pages = size;
		zone->present_pages = realsize;
#ifdef CONFIG_NUMA
		zone->node = nid;
		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
						/ 100;
		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
		zone->name = zone_names[j];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;

		zone->prev_priority = DEF_PRIORITY;

		zone_pcp_init(zone);
		/* Start with every LRU list empty. */
		for_each_lru(l) {
			INIT_LIST_HEAD(&zone->lru[l].list);
			zone->reclaim_stat.nr_saved_scan[l] = 0;
		}
		zone->reclaim_stat.recent_rotated[0] = 0;
		zone->reclaim_stat.recent_rotated[1] = 0;
		zone->reclaim_stat.recent_scanned[0] = 0;
		zone->reclaim_stat.recent_scanned[1] = 0;
		/* Presumably zeroes the zone's vm_stat counters. */
		zap_zone_vm_stats(zone);
		zone->flags = 0;

		/* Nothing more to do for a zone that spans no pages. */
		if (!size)
			continue;

		/*
		 * NOTE(review): the original annotation says this only has
		 * an effect when a related config macro is defined, which
		 * the annotated kernel version does not do - confirm.
		 */
		set_pageblock_order(pageblock_default_order());

		/*
		 * NOTE(review): per the original annotation this allocates
		 * the zone's pageblock_flags from the boot-time allocator -
		 * confirm against setup_usemap().
		 */
		setup_usemap(pgdat, zone, size);

		/*
		 * NOTE(review): per the original annotation this sets up the
		 * zone's wait queue table and its MAX_ORDER buddy free
		 * lists - confirm.  A non-zero return is treated as fatal.
		 */
		ret = init_currently_empty_zone(zone, zone_start_pfn,
						size, MEMMAP_EARLY);
		BUG_ON(ret);

		/* Initialise the struct page entries covering this zone. */
		memmap_init(size, nid, j, zone_start_pfn);

		/* The next zone starts right after this one. */
		zone_start_pfn += size;
	}
}

三、分配内存的备用区域初始化(非CONFIG_NUMA）

数据结构表示

/*
 * One entry of a zonelist: a zone pointer together with that zone's
 * index.  The opening of this definition was lost in the original text;
 * the two members are restored from how build_zonelists() uses them
 * (_zonerefs[j].zone and _zonerefs[j].zone_idx).
 */
struct zoneref {
	struct zone *zone;	/* Pointer to actual zone */
	int zone_idx;		/* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * If zlcache_ptr is not NULL, then it is just the address of zlcache,
 * as explained above. If zlcache_ptr is NULL, there is no zlcache.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone() - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx() - Return the index of the zone for an entry
 * zonelist_node_idx() - Return the index of the node for an entry
 *
 * In short: this structure describes the zone allocation (fallback) plan.
 * _zonerefs has one slot more than MAX_ZONES_PER_ZONELIST; build_zonelists()
 * terminates the list with an entry whose zone pointer is NULL.
 */
struct zonelist {
	struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
	struct zonelist_cache zlcache; // optional; only present on NUMA builds
#endif
};

代码中的英文注释很详细了

初始化

start_kernel()->build_all_zonelists()

void build_all_zonelists(void)

{

/*设置全局变量current_zonelist_order*/

set_zonelist_order();

if (system_state == SYSTEM_BOOTING) {

/*对所有节点创建zonelists*/

__build_all_zonelists(NULL);

/*调试用*/

mminit_verify_zonelist();

cpuset_init_current_mems_allowed();

} else {

/* we have to stop all cpus to guarantee there is no user

of zonelist */

stop_machine(__build_all_zonelists, NULL, NULL);

/* cpuset refresh routine should be here */

}

/*计算所有zone中可分配的页面数之和*/

vm_total_pages = nr_free_pagecache_pages();

/*

* Disable grouping by mobility if the number of pages in the

* system is too low to allow the mechanism to work. It would be

* more accurate, but expensive to check per-zone. This check is

* made on memory-hotadd so a system can start with mobility

* disabled and enable it later

*/

if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))

page_group_by_mobility_disabled = 1;

else

page_group_by_mobility_disabled = 0;

printk(“Built %i zonelists in %s order, mobility grouping %s. “

“Total pages: %ld\n”,

nr_online_nodes,

zonelist_order_name[current_zonelist_order],

page_group_by_mobility_disabled ? “off” : “on”,

vm_total_pages);

#ifdef CONFIG_NUMA

printk(“Policy zone: %s\n”, zone_names[policy_zone]);

#endif

}

/*
 * Build the zonelists (and zonelist cache) of every online node.
 *
 * Takes an unused pointer and returns int (always 0) only so that it can
 * be handed to stop_machine().
 */
static int __build_all_zonelists(void *dummy)
{
	int nid;

#ifdef CONFIG_NUMA
	memset(node_load, 0, sizeof(node_load));
#endif

	for_each_online_node(nid) {
		pg_data_t *node_data = NODE_DATA(nid);

		/* Fallback list first, then its cache. */
		build_zonelists(node_data);

		/*
		 * NOTE(review): per the original annotation, on UMA this
		 * merely sets the related pointer to NULL - confirm.
		 */
		build_zonelist_cache(node_data);
	}

	return 0;
}

/*
 * Build the fallback zonelist (node_zonelists[0]) for one node.
 *
 * The list starts with the node's own zones, followed by the zones of
 * the online nodes with a higher id and then of those with a lower id,
 * and is terminated by an entry whose zone pointer is NULL.
 */
static void build_zonelists(pg_data_t *pgdat)
{
	int node, local_node;
	enum zone_type j;
	struct zonelist *zonelist;

	local_node = pgdat->node_id;

	zonelist = &pgdat->node_zonelists[0];

	/* Local zones come first; j counts the entries filled so far. */
	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);

	/*
	 * Now we build the zonelist so that it contains the zones
	 * of all the other nodes.
	 * We don't want to pressure a particular node, so when
	 * building the zones for node N, we make sure that the
	 * zones coming right after the local ones are those from
	 * node N+1 (modulo N)
	 */
	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
		if (!node_online(node))
			continue;
		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
							MAX_NR_ZONES - 1);
	}
	for (node = 0; node < local_node; node++) {
		if (!node_online(node))
			continue;
		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
							MAX_NR_ZONES - 1);
	}

	/* Terminate the list. */
	zonelist->_zonerefs[j].zone = NULL;
	zonelist->_zonerefs[j].zone_idx = 0;
}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 *
 * @pgdat:     node whose zones are added.
 * @zonelist:  list being filled.
 * @nr_zones:  number of entries already in @zonelist; new entries are
 *             appended from this index.
 * @zone_type: highest zone index to consider; zones are visited from
 *             this index down to 0.
 *
 * Returns the number of entries in @zonelist after the additions.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
				int nr_zones, enum zone_type zone_type)
{
	struct zone *zone;

	BUG_ON(zone_type >= MAX_NR_ZONES);

	/* Pre-increment so the loop body can decrement before use. */
	zone_type++;

	do {
		zone_type--;
		zone = pgdat->node_zones + zone_type;
		/* Only zones that populated_zone() accepts are listed. */
		if (populated_zone(zone)) {
			zoneref_set_zone(zone,
				&zonelist->_zonerefs[nr_zones++]);
			check_highest_zone(zone_type);
		}
	} while (zone_type);

	return nr_zones;
}

内存管理区初始化主要是借助于启动分配器和已初始化的e820全局变量。内存管理区初始化后，相应的伙伴系统、slab机制等等就可以在此基础上建立了，在后面会一点一点总结。

赞(0) 打赏

转载请注明出处：服务器评测 » Linux物理内存管理区初始化

标签：内存初始化变量页面

相关推荐

QQ咨询
QQ咨询
回顶
回顶部