Linux物理内存管理区在start_kernel函数中进行初始化,此时启动分配器已经建立,所以可以从bootmem中分配需要的内存。
一、全局变量初始化
max_pfn:最大物理页面帧号
start_kernel()->setup_arch()->e820_end_of_ram_pfn()找出最大可用内存页面帧号。
- void __init setup_arch(char **cmdline_p)
- {
- ……
- /*
- * partially used pages are not usable – thus
- * we are rounding upwards:
- */
- /*遍历e820.map,找到系统中的最大内存数,
- 这个内存数需小于4G*/
- max_pfn = e820_end_of_ram_pfn();
- ……
- }
- unsigned long __init e820_end_of_ram_pfn(void)
- {
- /*MAX_ARCH_PFN为4G空间*/
- return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
- }
- /*
- * Find the highest page frame number we have available
- */
- static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
- {
- int i;
- unsigned long last_pfn = 0;
- unsigned long max_arch_pfn = MAX_ARCH_PFN;/*4G地址空间对应的页面数*/
- /*对e820中所有的内存块,其中e820为从bios中探测到的页面数存放处*/
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];/*第i个物理页面块*/
- unsigned long start_pfn;
- unsigned long end_pfn;
- if (ei->type != type)/*与找的类型不匹配*/
- continue;
- /*起始地址对应的页面帧号*/
- start_pfn = ei->addr >> PAGE_SHIFT;
- /*结束物理地址对应的页面帧号*/
- end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
- if (start_pfn >= limit_pfn)
- continue;
- if (end_pfn > limit_pfn) {
- last_pfn = limit_pfn;/*找到的结束页面帧号大于限制大小时*/
- break;
- }
- if (end_pfn > last_pfn)
- last_pfn = end_pfn;/*保存更新last_pfn*/
- }
- if (last_pfn > max_arch_pfn)/*大于4G空间时*/
- last_pfn = max_arch_pfn;
- /*打印输出信息*/
- printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
- last_pfn, max_arch_pfn);
- /*返回最后一个页面帧号*/
- return last_pfn;
- }
max_low_pfn:低端内存最大页面数
start_kernel()->setup_arch()->find_low_pfn_range()
- /*
- * Determine low and high memory ranges:
- */
- /*找到低端内存的最大内存页面数,初始化两个变量*/
- void __init find_low_pfn_range(void)
- {
- /* it could update max_pfn */
- /*当内存的大小本来就小于低端内存的最大页框数时;
- 直接没有高端地址映射*/
- if (max_pfn <= MAXMEM_PFN)
- lowmem_pfn_init();
- else/*这是一般PC机的运行流程,存在高端映射*/
- highmem_pfn_init();
- }
我们直接看具有高端地址空间的部分。
- /*
- * We have more RAM than fits into lowmem – we try to put it into
- * highmem, also taking the highmem=x boot parameter into account:
- */
- /*高端地址空间的页面数可以在启动中进行配置;
- 如果不配置,在这里进行设置大小*/
- void __init highmem_pfn_init(void)
- {
- /*MAXMEM_PFN为最大物理地址-(4M+4M+8K+128M);
- 所以低端内存的大小其实比我们说的896M低一些*/
- max_low_pfn = MAXMEM_PFN;
- if (highmem_pages == -1)/*高端内存页面数如果在开机没有设置*/
- highmem_pages = max_pfn - MAXMEM_PFN;/*总页面数减去低端页面数*/
- /*如果highmem_pages变量在启动项设置了,那么在这里就要进行这样的判断,因为可能出现不一致的情况*/
- if (highmem_pages + MAXMEM_PFN < max_pfn)
- max_pfn = MAXMEM_PFN + highmem_pages;
- if (highmem_pages + MAXMEM_PFN > max_pfn) {
- printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
- pages_to_mb(max_pfn - MAXMEM_PFN),
- pages_to_mb(highmem_pages));
- highmem_pages = 0;
- }
- #ifndef CONFIG_HIGHMEM
- /* Maximum memory usable is what is directly addressable */
- printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
- if (max_pfn > MAX_NONPAE_PFN)
- printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
- else
- printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
- max_pfn = MAXMEM_PFN;
- #else /* !CONFIG_HIGHMEM *//*存在高端地址情况*/
- #ifndef CONFIG_HIGHMEM64G
- /*在没有配置64G的情况下,内存的大小不能超过4G*/
- if (max_pfn > MAX_NONPAE_PFN) {
- max_pfn = MAX_NONPAE_PFN;
- printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
- }
- #endif /* !CONFIG_HIGHMEM64G */
- #endif /* !CONFIG_HIGHMEM */
- }
二、管理区初始化
start_kernel()->setup_arch()->paging_init()->zone_sizes_init()
- static void __init zone_sizes_init(void)
- {
- /*初始化几个内存区中的最大页面数,在后面用于具体的初始化工作*/
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] =/*DMA区的最大页面帧号,后面的类似*/
- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
- #ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
- #endif
- /*内存体系的MMU建立,包括伙伴系统的初步建立*/
- free_area_init_nodes(max_zone_pfns);
- }
其中x86-32非PAE下,MAX_DMA_ADDRESS是一个虚拟地址,值为PAGE_OFFSET+16M(PAGE_OFFSET为3G时即3G+16M);经virt_to_phys()转换后对应物理地址16M,所以ZONE_DMA的上限为物理内存的前16M。
- /* The maximum address that we can perform a DMA transfer to on this platform */
- #define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
- /**
- * free_area_init_nodes – Initialise all pg_data_t and zone data
- * @max_zone_pfn: an array of max PFNs for each zone
- *
- * This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by add_active_range(), the size of each
- * zone in each node and their holes is calculated. If the maximum PFN
- * between two adjacent zones match, it is assumed that the zone is empty.
- * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
- * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
- * starts where the previous one ended. For example, ZONE_DMA32 starts
- * at arch_max_dma_pfn.
- */
- void __init free_area_init_nodes(unsigned long *max_zone_pfn)
- {
- unsigned long nid;
- int i;
- /* Sort early_node_map as initialisation assumes it is sorted */
- /*将活动区域进行排序,关于活动区域在后面会有介绍*/
- sort_node_map();
- /* Record where the zone boundaries are */
- memset(arch_zone_lowest_possible_pfn, 0,
- sizeof(arch_zone_lowest_possible_pfn));
- memset(arch_zone_highest_possible_pfn, 0,
- sizeof(arch_zone_highest_possible_pfn));
- /*找出活动内存中最小的页面,在代码中的作者的注释很详细*/
- arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
- arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
- for (i = 1; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
- continue;
- arch_zone_lowest_possible_pfn[i] =
- arch_zone_highest_possible_pfn[i-1];/*假定区域连续,下一个区域的最小页面为上一个区的最后页面*/
- arch_zone_highest_possible_pfn[i] =
- max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
- }
- /*对ZONE_MOVABLE区域设置为0*/
- arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
- arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
- /* Find the PFNs that ZONE_MOVABLE begins at in each node */
- memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
- find_zone_movable_pfns_for_nodes(zone_movable_pfn);/*找出每个节点中ZONE_MOVABLE的起始页面帧号;movable为新引入的机制,在后面的文章中会对其详细分析*/
- /* Print out the zone ranges */
- printk("Zone PFN ranges:\n");
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
- continue;
- printk(" %-8s %0#10lx -> %0#10lx\n",
- zone_names[i],
- arch_zone_lowest_possible_pfn[i],
- arch_zone_highest_possible_pfn[i]);
- }
- /* Print out the PFNs ZONE_MOVABLE begins at in each node */
- printk("Movable zone start PFN for each node\n");
- for (i = 0; i < MAX_NUMNODES; i++) {
- if (zone_movable_pfn[i])
- printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
- }
- /* Print out the early_node_map[] */
- printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
- for (i = 0; i < nr_nodemap_entries; i++)
- printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
- early_node_map[i].start_pfn,
- early_node_map[i].end_pfn);
- /* Initialise every node */
- /*调试用*/
- mminit_verify_pageflags_layout();
- setup_nr_node_ids();
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
- /*zone中数据的初始化,伙伴系统建立但是没有页面
- 和数据,页面在后面的mem_init中得到*/
- free_area_init_node(nid, NULL,
- find_min_pfn_for_node(nid), NULL);
- /* Any memory on that node */
- if (pgdat->node_present_pages)
- node_set_state(nid, N_HIGH_MEMORY);
- /*内存的相关检查*/
- check_for_regular_memory(pgdat);
- }
- }
- void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
- unsigned long node_start_pfn, unsigned long *zholes_size)
- {
- pg_data_t *pgdat = NODE_DATA(nid);
- pgdat->node_id = nid;
- pgdat->node_start_pfn = node_start_pfn;/*这个在前面调用一个函数得到*/
- /*计算系统中节点nid的所有物理页面保存在数据结构中*/
- calculate_node_totalpages(pgdat, zones_size, zholes_size);
- /*当节点只有一个时,将节点的map保存到全局变量中*/
- alloc_node_mem_map(pgdat);
- #ifdef CONFIG_FLAT_NODE_MEM_MAP
- printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
- nid, (unsigned long)pgdat,
- (unsigned long)pgdat->node_mem_map);
- #endif
- /*zone中相关数据的初始化,包括伙伴系统,等待队列,相关变量,
- 数据结构、链表等;*/
- free_area_init_core(pgdat, zones_size, zholes_size);
- }
具体的区域的初始化在下面函数进行
- /*
- * Set up the zone data structures:
- * – mark all pages reserved
- * – mark all memory queues empty
- * – clear the memory bitmaps
- */
- static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long *zholes_size)
- {
- enum zone_type j;
- int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
- int ret;
- pgdat_resize_init(pgdat);
- pgdat->nr_zones = 0;
- init_waitqueue_head(&pgdat->kswapd_wait);
- pgdat->kswapd_max_order = 0;
- pgdat_page_cgroup_init(pgdat);
- for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize, memmap_pages;
- enum lru_list l;
- /*下面的两个函数会获得指定节点的真实内存大小*/
- size = zone_spanned_pages_in_node(nid, j, zones_size);
- realsize = size - zone_absent_pages_in_node(nid, j,
- zholes_size);
- /*
- * Adjust realsize so that it accounts for how much memory
- * is used by this zone for memmap. This affects the watermark
- * and per-cpu initialisations
- */
- memmap_pages =/*存放页面所需要的内存大小*/
- PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
- if (realsize >= memmap_pages) {
- realsize -= memmap_pages;
- if (memmap_pages)
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
- } else
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds realsize %lu\n",
- zone_names[j], memmap_pages, realsize);
- /* Account for reserved pages */
- if (j == 0 && realsize > dma_reserve) {
- realsize -= dma_reserve;/*减去为DMA保留的页面*/
- printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
- zone_names[0], dma_reserve);
- }
- if (!is_highmem_idx(j))/*如果不是高端内存区*/ nr_kernel_pages += realsize;
- nr_all_pages += realsize;
- /*下面为初始化zone结构的相关变量*/
- zone->spanned_pages = size;
- zone->present_pages = realsize;
- #ifdef CONFIG_NUMA
- zone->node = nid;
- zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
- / 100;
- zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
- #endif
- zone->name = zone_names[j];
- spin_lock_init(&zone->lock);
- spin_lock_init(&zone->lru_lock);
- zone_seqlock_init(zone);
- zone->zone_pgdat = pgdat;
- zone->prev_priority = DEF_PRIORITY;
- zone_pcp_init(zone);
- for_each_lru(l) {//初始化链表
- INIT_LIST_HEAD(&zone->lru[l].list);
- zone->reclaim_stat.nr_saved_scan[l] = 0;
- }
- zone->reclaim_stat.recent_rotated[0] = 0;
- zone->reclaim_stat.recent_rotated[1] = 0;
- zone->reclaim_stat.recent_scanned[0] = 0;
- zone->reclaim_stat.recent_scanned[1] = 0;
- zap_zone_vm_stats(zone);/*将变量zone->vm_stat变量置0*/
- zone->flags = 0;
- if (!size)
- continue;
- /*需要定义相关宏该版本没定义*/
- set_pageblock_order(pageblock_default_order());
- /*zone中变量pageblock_flags内存申请,从启动分配器中*/
- setup_usemap(pgdat, zone, size);
- /*zone中的任务等待队列和zone的伙伴系统(MAX_ORDER个链表)的初始化,关于伙伴系统将单独在后面总结*/
- ret = init_currently_empty_zone(zone, zone_start_pfn,
- size, MEMMAP_EARLY);
- BUG_ON(ret);
- /*zone中page相关属性的初始化工作*/
- memmap_init(size, nid, j, zone_start_pfn);
- zone_start_pfn += size;
- }
- }
三、分配内存的备用区域初始化(非CONFIG_NUMA)
数据结构表示
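- /*zonelist中的一项:指向一个管理区,并缓存该管理区的索引*/
- struct zoneref {
- struct zone *zone; /* Pointer to actual zone */
- int zone_idx; /* zone_idx(zoneref->zone) */
- };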
- /*
- * One allocation request operates on a zonelist. A zonelist
- * is a list of zones, the first one is the ‘goal’ of the
- * allocation, the other zones are fallback zones, in decreasing
- * priority.
- *
- * If zlcache_ptr is not NULL, then it is just the address of zlcache,
- * as explained above. If zlcache_ptr is NULL, there is no zlcache.
- * *
- * To speed the reading of the zonelist, the zonerefs contain the zone index
- * of the entry being read. Helper functions to access information given
- * a struct zoneref are
- *
- * zonelist_zone() – Return the struct zone * for an entry in _zonerefs
- * zonelist_zone_idx() – Return the index of the zone for an entry
- * zonelist_node_idx() – Return the index of the node for an entry
- *///zone分配方案
- struct zonelist {
- struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
- struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
- #ifdef CONFIG_NUMA
- struct zonelist_cache zlcache; // optional …
- #endif
- };
代码中的英文注释很详细了
初始化
start_kernel()->build_all_zonelists()
- void build_all_zonelists(void)
- {
- /*设置全局变量current_zonelist_order*/
- set_zonelist_order();
- if (system_state == SYSTEM_BOOTING) {
- /*对所有节点创建zonelists*/
- __build_all_zonelists(NULL);
- /*调试用*/
- mminit_verify_zonelist();
- cpuset_init_current_mems_allowed();
- } else {
- /* we have to stop all cpus to guarantee there is no user
- of zonelist */
- stop_machine(__build_all_zonelists, NULL, NULL);
- /* cpuset refresh routine should be here */
- }
- /*计算所有zone中可分配的页面数之和*/
- vm_total_pages = nr_free_pagecache_pages();
- /*
- * Disable grouping by mobility if the number of pages in the
- * system is too low to allow the mechanism to work. It would be
- * more accurate, but expensive to check per-zone. This check is
- * made on memory-hotadd so a system can start with mobility
- * disabled and enable it later
- */
- if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
- page_group_by_mobility_disabled = 1;
- else
- page_group_by_mobility_disabled = 0;
- printk("Built %i zonelists in %s order, mobility grouping %s. "
- "Total pages: %ld\n",
- nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
- page_group_by_mobility_disabled ? "off" : "on",
- vm_total_pages);
- #ifdef CONFIG_NUMA
- printk("Policy zone: %s\n", zone_names[policy_zone]);
- #endif
- }
- /* return values int ….just for stop_machine() */
- static int __build_all_zonelists(void *dummy)
- {
- int nid;
- #ifdef CONFIG_NUMA
- memset(node_load, 0, sizeof(node_load));
- #endif
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
- /*创建zonelists,这个数组用来在分配内存中做回绕,循环访问*/
- build_zonelists(pgdat);
- /*在UMA中,这个仅仅是把相关的变量设置成了NULL*/
- build_zonelist_cache(pgdat);
- }
- return 0;
- }
- static void build_zonelists(pg_data_t *pgdat)
- {
- int node, local_node;
- enum zone_type j;
- struct zonelist *zonelist;
- local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[0];
- /*将zone添加到zone链表中,这样,zone中page的
- 分配等操作将依靠这个环形的链表;*/
- j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
- /*
- * Now we build the zonelist so that it contains the zones
- * of all the other nodes.
- * We don’t want to pressure a particular node, so when
- * building the zones for node N, we make sure that the
- * zones coming right after the local ones are those from
- * node N+1 (modulo N)
- *//*对其他在线的节点创建zonelist*/
- for (node = local_node + 1; node < MAX_NUMNODES; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j,
- MAX_NR_ZONES - 1);
- }
- for (node = 0; node < local_node; node++) {
- if (!node_online(node))
- continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j,
- MAX_NR_ZONES - 1);
- }
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
- }
- /*
- * Builds allocation fallback zone lists.
- *
- * Add all populated zones of a node to the zonelist.
- */
- static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
- int nr_zones, enum zone_type zone_type)
- {
- struct zone *zone;
- BUG_ON(zone_type >= MAX_NR_ZONES);
- zone_type++;
- do {
- zone_type--;
- zone = pgdat->node_zones + zone_type;
- if (populated_zone(zone)) {/*如果以页面为单位的管理区的总大小不为0*/
- zoneref_set_zone(zone,/*设置管理区链表,将相关信息加入*/
- &zonelist->_zonerefs[nr_zones++]);
- check_highest_zone(zone_type);
- }
- } while (zone_type);
- return nr_zones;
- }
内存管理区初始化主要是借助于启动分配器和已初始化的e820全局变量。内存管理区初始化后,相应的伙伴系统、slab机制等就可以在此基础上建立了,在后面会一点一点总结。