Linux内核中创建cache节点由函数kmem_cache_create()实现。
该函数的执行流程:
1,从全局cache_cache中获得cache结构,因为全局cache_cache初始化对象的大小就是kmem_cache结构的大小,所以返回的指针正好可以转换为cache结构;调用 kmem_cache_zalloc(&cache_cache, gfp);
2,计算每个slab占用的页面数(order)和每个slab可容纳的对象数,并得到slab中碎片(剩余空间)的大小,由函数calculate_slab_order()实现;
3,计算并初始化cache的各种属性;如果slab管理对象采用外置式(off-slab),需要用kmem_find_general_cachep(slab_size, 0u)指定cachep->slabp_cache,用于存放struct slab管理对象和kmem_bufctl_t[]数组;
4,设置每个CPU上的本地cache,由函数setup_cpu_cache()实现;
5,cache创建完毕,将其加入到全局slab cache链表中;
一、主实现
[cpp]
- /**
- * kmem_cache_create – Create a cache.
- * @name: A string which is used in /proc/slabinfo to identify this cache.
- * @size: The size of objects to be created in this cache.
- * @align: The required alignment for the objects.
- * @flags: SLAB flags
- * @ctor: A constructor for the objects.
- *
- * Returns a ptr to the cache on success, NULL on failure.
- * Cannot be called within a int, but can be interrupted.
- * The @ctor is run when new pages are allocated by the cache.
- *
- * @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting unloaded.
- * Note that kmem_cache_name() is not guaranteed to return the same pointer,
- * therefore applications must manage it themselves.
- *
- * The flags are
- *
- * %SLAB_POISON – Poison the slab with a known test pattern (a5a5a5a5)
- * to catch references to uninitialised memory.
- *
- * %SLAB_RED_ZONE – Insert `Red’ zones around the allocated memory to check
- * for buffer overruns.
- *
- * %SLAB_HWCACHE_ALIGN – Align the objects in this cache to a hardware
- * cacheline. This can be beneficial if you’re counting cycles as closely
- * as davem.
- */
- /*创建slab系统顶层的cache节点。创建完成后,cache
- 里并没有任何slab以及对象,只有当分配对象
- ,并且cache中没有空闲对象时,才会创建新的slab。*/
- struct kmem_cache *
- kmem_cache_create (const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
- {
- size_t left_over, slab_size, ralign;
- struct kmem_cache *cachep = NULL, *pc;
- gfp_t gfp;
- /*
- * Sanity checks… these are all serious usage bugs.
- *//* 安全性检查 */
- if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
- size > KMALLOC_MAX_SIZE) {
- printk(KERN_ERR “%s: Early error in slab %s\n”, __func__,
- name);
- BUG();
- }
- /*
- * We use cache_chain_mutex to ensure a consistent view of
- * cpu_online_mask as well. Please see cpuup_callback
- */
- /* slab分配器是否已经初始化好,如果是内核启动阶段
- ,则只有一个cpu执行slab分配器的初始化动作,无需加锁,否则需要加锁 */
- if (slab_is_available()) {
- get_online_cpus();
- mutex_lock(&cache_chain_mutex);
- }
- /* 遍历cache链,做些校验工作 */
- list_for_each_entry(pc, &cache_chain, next) {
- char tmp;
- int res;
- /*
- * This happens when the module gets unloaded and doesn’t
- * destroy its slab cache and no-one else reuses the vmalloc
- * area of the module. Print a warning.
- */
- /* 检查cache链表中的cache是否都有名字 */
- res = probe_kernel_address(pc->name, tmp);
- if (res) {/*没有名字,报错*/
- printk(KERN_ERR
- “SLAB: cache with size %d has lost its name\n”,
- pc->buffer_size);
- continue;
- }
- /* 检查cache链表中是否已经存在相同名字的cache */
- if (!strcmp(pc->name, name)) {
- printk(KERN_ERR
- “kmem_cache_create: duplicate cache %s\n”, name);
- dump_stack();
- goto oops;
- }
- }
- #if DEBUG
- WARN_ON(strchr(name, ‘ ‘)); /* It confuses parsers */
- #if FORCED_DEBUG
- /*
- * Enable redzoning and last user accounting, except for caches with
- * large objects, if the increased size would increase the object size
- * above the next power of two: caches with object sizes just above a
- * power of two have a significant amount of internal fragmentation.
- */
- if (size < 4096 || fls(size – 1) == fls(size-1 + REDZONE_ALIGN +
- 2 * sizeof(unsigned long long)))
- flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
- if (!(flags & SLAB_DESTROY_BY_RCU))
- flags |= SLAB_POISON;
- #endif
- if (flags & SLAB_DESTROY_BY_RCU)
- BUG_ON(flags & SLAB_POISON);
- #endif
- /*
- * Always checks flags, a caller might be expecting debug support which
- * isn’t available.
- */
- BUG_ON(flags & ~CREATE_MASK);
- /*
- * Check that size is in terms of words. This is needed to avoid
- * unaligned accesses for some archs when redzoning is used, and makes
- * sure any on-slab bufctl’s are also correctly aligned.
- */
- if (size & (BYTES_PER_WORD – 1)) {
- size += (BYTES_PER_WORD – 1);
- size &= ~(BYTES_PER_WORD – 1);
- }
- /* calculate the final buffer alignment: */
- /* 1) arch recommendation: can be overridden for debug */
- if (flags & SLAB_HWCACHE_ALIGN) {
- /*
- * Default alignment: as specified by the arch code. Except if
- * an object is really small, then squeeze multiple objects into
- * one cacheline.
- */
- ralign = cache_line_size();
- while (size <= ralign / 2)
- ralign /= 2;
- } else {
- ralign = BYTES_PER_WORD;
- }
- /*
- * Redzoning and user store require word alignment or possibly larger.
- * Note this will be overridden by architecture or caller mandated
- * alignment if either is greater than BYTES_PER_WORD.
- */
- if (flags & SLAB_STORE_USER)
- ralign = BYTES_PER_WORD;
- if (flags & SLAB_RED_ZONE) {
- ralign = REDZONE_ALIGN;
- /* If redzoning, ensure that the second redzone is suitably
- * aligned, by adjusting the object size accordingly. */
- size += REDZONE_ALIGN – 1;
- size &= ~(REDZONE_ALIGN – 1);
- }
- /* 2) arch mandated alignment */
- if (ralign < ARCH_SLAB_MINALIGN) {
- ralign = ARCH_SLAB_MINALIGN;
- }
- /* 3) caller mandated alignment */
- if (ralign < align) {
- ralign = align;
- }
- /* disable debug if necessary */
- if (ralign > __alignof__(unsigned long long))
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
- /*
- * 4) Store it.
- */
- align = ralign;
- /* slab分配器是否已经可用 */
- if (slab_is_available())
- gfp = GFP_KERNEL;
- else
- /* slab初始化好之前,不允许阻塞,且只能在低端内存区分配 */
- gfp = GFP_NOWAIT;
- /* Get cache’s description obj. */
- /* 获得struct kmem_cache对象 ,为什么能从cache中获得的对象是
- kmem_cache结构呢,因为这里的全局变量cache_cache的对象大小
- 就是kmem_cache结构大小*/
- cachep = kmem_cache_zalloc(&cache_cache, gfp);
- if (!cachep)
- goto oops;
- #if DEBUG
- cachep->obj_size = size;
- /*
- * Both debugging options require word-alignment which is calculated
- * into align above.
- */
- if (flags & SLAB_RED_ZONE) {
- /* add space for red zone words */
- cachep->obj_offset += sizeof(unsigned long long);
- size += 2 * sizeof(unsigned long long);
- }
- if (flags & SLAB_STORE_USER) {
- /* user store requires one word storage behind the end of
- * the real object. But if the second red zone needs to be
- * aligned to 64 bits, we must allow that much space.
- */
- if (flags & SLAB_RED_ZONE)
- size += REDZONE_ALIGN;
- else
- size += BYTES_PER_WORD;
- }
- #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE – size;
- size = PAGE_SIZE;
- }
- #endif
- #endif
- /*
- * Determine if the slab management is ‘on’ or ‘off’ slab.
- * (bootstrapping cannot cope with offslab caches so don’t do
- * it too early on.)
- */
- /* 确定slab管理对象的存储方式:内置还是外置
- 。通常,当对象大于等于512时,使用外置方式
- 。初始化阶段采用内置式。
- slab_early_init 参见kmem_cache_init函数 */
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
- /*
- * Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= CFLGS_OFF_SLAB;
- size = ALIGN(size, align);
- /* 获得slab中碎片的大小 */
- left_over = calculate_slab_order(cachep, size, align, flags);
- /* cachep->num为该cache中每个slab的对象数,为0,表示为该对象创建cache失败 */
- if (!cachep->num) {
- printk(KERN_ERR
- “kmem_cache_create: couldn’t create cache %s.\n”, name);
- kmem_cache_free(&cache_cache, cachep);
- cachep = NULL;
- goto oops;
- }
- /* 计算slab管理对象的大小,包括struct slab对象和kmem_bufctl_t数组 */
- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
- + sizeof(struct slab), align);
- /*
- * If the slab has been placed off-slab, and we have enough space then
- * move it on-slab. This is at the expense of any extra colouring.
- */
- /* 如果这是一个外置式slab,并且碎片大小大于slab管理对象的大小
- ,则可将slab管理对象移到slab中,改造成一个内置式slab */
- if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
- /* 除去off-slab标志位 */
- flags &= ~CFLGS_OFF_SLAB;
- /* 更新碎片大小 */
- left_over -= slab_size;
- }
- if (flags & CFLGS_OFF_SLAB) {
- /* really off slab. No need for manual alignment */
- /* align是针对slab对象的,如果slab管理对象是外置存储
- ,自然不会像内置那样影响到后面slab对象的存储位置
- ,也就不需要对齐了 */
- slab_size =
- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
- #ifdef CONFIG_PAGE_POISONING
- /* If we’re going to use the generic kernel_map_pages()
- * poisoning, then it’s going to smash the contents of
- * the redzone and userword anyhow, so switch them off.
- */
- if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
- #endif
- }
- /* cache的着色块的单位大小 */
- cachep->colour_off = cache_line_size();
- /* Offset must be a multiple of the alignment. */
- /* 着色块大小必须是对象要求对齐方式的倍数 */
- if (cachep->colour_off < align)
- cachep->colour_off = align;
- /* 计算碎片区需要多少个着色快 */
- cachep->colour = left_over / cachep->colour_off;
- /* slab管理对象的大小 */
- cachep->slab_size = slab_size;
- cachep->flags = flags;
- cachep->gfpflags = 0;
- if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
- cachep->gfpflags |= GFP_DMA;
- /* slab对象的大小 */
- cachep->buffer_size = size;
- /* 计算对象在slab中索引时用,参见obj_to_index函数 */
- cachep->reciprocal_buffer_size = reciprocal_value(size);
- if (flags & CFLGS_OFF_SLAB) {
- /* 分配一个slab管理区域对象,保存在slabp_cache中,
- 这个函数传入的大小为slab_size,也就是分配slab_size大小的cache
- ,在slab创建的时候如果是外置式,那么需要从分配的这里面
- 分配出slab对象,剩下的空间放kmem_bufctl_t[]数组,
- 如果是内置式的slab,此指针为空 */
- cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
- /*
- * This is a possibility for one of the malloc_sizes caches.
- * But since we go off slab only for object size greater than
- * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
- * this should not happen at all.
- * But leave a BUG_ON for some lucky dude.
- */
- BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
- }
- cachep->ctor = ctor;
- cachep->name = name;
- /* 设置每个cpu上的local cache */
- if (setup_cpu_cache(cachep, gfp)) {
- __kmem_cache_destroy(cachep);
- cachep = NULL;
- goto oops;
- }
- /* cache setup completed, link it into the list */
- /* cache创建完毕,将其加入到全局slab cache链表中 */
- list_add(&cachep->next, &cache_chain);
- oops:
- if (!cachep && (flags & SLAB_PANIC))
- panic(“kmem_cache_create(): failed to create slab `%s’\n”,
- name);
- if (slab_is_available()) {
- mutex_unlock(&cache_chain_mutex);
- put_online_cpus();
- }
- return cachep;
- }
其中,cache_cache
[cpp]
- /* internal cache of cache description objs */
- static struct kmem_cache cache_cache = {
- .batchcount = 1,
- .limit = BOOT_CPUCACHE_ENTRIES,
- .shared = 1,
- .buffer_size = sizeof(struct kmem_cache),/*大小为cache结构,难怪名称为cache_cache*/
- .name = “kmem_cache”,
- };
二、计算slab碎片大小
[cpp]
- /**
- * calculate_slab_order – calculate size (page order) of slabs
- * @cachep: pointer to the cache that is being created
- * @size: size of objects to be created in this cache.
- * @align: required alignment for the objects.
- * @flags: slab allocation flags
- *
- * Also calculates the number of objects per slab.
- *
- * This could be made much more intelligent. For now, try to avoid using
- * high order pages for slabs. When the gfp() functions are more friendly
- * towards high-order requests, this should be changed.
- */
- /*计算slab由几个页面组成,同时计算每个slab中有多少对象*/
- static size_t calculate_slab_order(struct kmem_cache *cachep,
- size_t size, size_t align, unsigned long flags)
- {
- unsigned long offslab_limit;
- size_t left_over = 0;
- int gfporder;
- for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
- unsigned int num;
- size_t remainder;
- /* 计算slab中对象数 */
- cache_estimate(gfporder, size, align, flags, &remainder, &num);
- /* 对象数为0,表示此order下,一个对象都放不下,检查下一order */
- if (!num)
- continue;
- if (flags & CFLGS_OFF_SLAB) {
- /*
- * Max number of objs-per-slab for caches which
- * use off-slab slabs. Needed to avoid a possible
- * looping condition in cache_grow().
- */
- /* 创建一个外置式slab时,要相应分配该slab的管理对象
- ,包含struct slab对象和kmem_bufctl_t数组,分配管理对象的流程就是分配普通对象的流程
- ,再来看一下分配对象的流程:
- kmem_cache_alloc->__cache_alloc-> __do_cache_alloc-> ____cache_alloc-> cache_alloc_refill-> cache_grow-> alloc_slabmgmt-> kmem_cache_alloc_node-> kmem_cache_alloc
- 可以看出这里可能存在一个循环,循环的关键在于alloc_slabmgmt函数
- ,当slab管理对象是off-slab方式时,就形成了循环
- 。那么什么时候slab管理对象会采用外置式slab呢?显然当其管理的slab中对象很多
- ,从而kmem_bufctl_t数组很大,致使整个管理对象也很大,此时才会形成循环
- 。故需要对kmem_bufctl_t的数目做限制,下面的算法是很粗略的,既然对象大小为size时
- ,是外置式slab,那么我们假设管理对象的大小也是size,计算出kmem_bufctl_t数组的大小
- ,即此大小的kmem_bufctl_t数组一定会造成管理对象是外置式slab。之所以说粗略
- ,是指数组大小小于这个限制时,也不能确保管理对象一定是内置式slab。但这也不会引发错误
- ,因为还有一个slab_break_gfp_order阀门来控制每个slab所占页面数,通常其值为1,即每个slab最多两个页面
- ,外置式slab存放的都是大于512的大对象,所以
- slab中不会有太多的大对象,kmem_bufctl_t数组也不会很大,粗略判断一下就足够了。
- */
- offslab_limit = size – sizeof(struct slab);
- offslab_limit /= sizeof(kmem_bufctl_t);
- /* 对象数目大于限制,跳出循环,不再尝试更大的order
- ,避免slab中对象数目过多
- ,此时计算的对象数也是有效的,循环一次没什么 */
- if (num > offslab_limit)
- break;
- }
- /* Found something acceptable – save it away */
- /* 每个slab中的对象数 */
- cachep->num = num;
- /* slab的order,即由几个页面组成 */
- cachep->gfporder = gfporder;
- /* slab中剩余空间(碎片)的大小 */
- left_over = remainder;
- /*
- * A VFS-reclaimable slab tends to have most allocations
- * as GFP_NOFS and we really don’t want to have to be allocating
- * higher-order pages when we are unable to shrink dcache.
- */
- /* SLAB_RECLAIM_ACCOUNT表示此slab所占页面为可回收的
- ,当内核检测是否有足够的页面满足用户态的需求时
- ,此类页面将被计算在内,通过调用
- kmem_freepages()函数可以释放分配给slab的页框。由于是可回收的
- ,所以不需要做后面的碎片检测了 */
- if (flags & SLAB_RECLAIM_ACCOUNT)
- break;
- /*
- * Large number of objects is good, but very large slabs are
- * currently bad for the gfp()s.
- */
- /* slab_break_gfp_order为slab所占页面的阀门,超过这个阀门时
- ,无论碎片大小,都不再检测更高的order了 */
- if (gfporder >= slab_break_gfp_order)
- break;
- /*
- * Acceptable internal fragmentation?
- */
- /* slab所占页面的大小是碎片大小的8倍以上
- ,页面利用率较高,可以接受这样的order */
- if (left_over * 8 <= (PAGE_SIZE << gfporder))
- break;
- }
- /* 返回碎片大小 */
- return left_over;
- }
三、查找指定大小cache
[cpp]
- /*在general cache中分配一个struct kmem_cache对象。直接调用__find_general_cachep。*/
- static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
- {
- return __find_general_cachep(size, gfpflags);
- }
[cpp]
- static inline struct kmem_cache *__find_general_cachep(size_t size,
- gfp_t gfpflags)
- {
- struct cache_sizes *csizep = malloc_sizes;
- #if DEBUG
- /* This happens if someone tries to call
- * kmem_cache_create(), or __kmalloc(), before
- * the generic caches are initialized.
- */
- BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
- #endif
- if (!size)
- return ZERO_SIZE_PTR;
- /* 找到合适的malloc size */
- while (size > csizep->cs_size)
- csizep++;
- /*
- * Really subtle: The last entry with cs->cs_size==ULONG_MAX
- * has cs_{dma,}cachep==NULL. Thus no special case
- * for large kmalloc calls required.
- */
- #ifdef CONFIG_ZONE_DMA
- if (unlikely(gfpflags & GFP_DMA))
- return csizep->cs_dmacachep;
- #endif
- /* 返回该大小级别的cache */
- return csizep->cs_cachep;
- }
四、设置CPU本地cache
[cpp]
- /*配置local cache和slab三链。*/
- static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
- {
- /* general cache初始化完毕,配置每个cpu的local cache */
- if (g_cpucache_up == FULL)
- return enable_cpucache(cachep, gfp);
- /* 此时处于系统初始化阶段,g_cpucache_up记录general cache初始化的进度
- ,比如PARTIAL_AC表示struct array_cache所在的cache已经创建,
- PARTIAL_L3表示struct kmem_list3所在的cache已经创建
- ,注意创建这两个cache的先后顺序
- 。在初始化阶段只需配置主cpu的local cache和slab三链 */
- if (g_cpucache_up == NONE) {
- /*
- * Note: the first kmem_cache_create must create the cache
- * that’s used by kmalloc(24), otherwise the creation of
- * further caches will BUG().
- */
- /* 初始化阶段创建struct array_cache所在cache时进入这个流程
- ,此时struct array_cache所在的general cache还未创建
- ,只能使用静态分配的全局变量initarray_generic表示的local cache */
- cachep->array[smp_processor_id()] = &initarray_generic.cache;
- /*
- * If the cache that’s used by kmalloc(sizeof(kmem_list3)) is
- * the first cache, then we need to set up all its list3s,
- * otherwise the creation of further caches will BUG().
- */
- /* 创建struct kmem_list3所在的cache是在struct array_cache所在cache之后
- ,所以此时struct kmem_list3所在的
- cache也一定没有创建,也需要使用全局变量 */
- set_up_list3s(cachep, SIZE_AC);
- /* 执行到这struct array_cache所在的cache创建完毕
- ,如果struct kmem_list3和struct array_cache位于同一个general cache中
- ,不会再重复创建了
- ,g_cpucache_up表示的进度更进一步 */
- if (INDEX_AC == INDEX_L3)
- g_cpucache_up = PARTIAL_L3;
- else
- g_cpucache_up = PARTIAL_AC;
- } else {
- /* g_cpucache_up至少为PARTIAL_AC时进入这个流程,struct array_cache所在的
- general cache已经建立起来,可以通过kmalloc分配了 */
- cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init), gfp);
- if (g_cpucache_up == PARTIAL_AC) {
- /* struct kmem_list3所在cache仍未创建完毕,还需使用全局的slab三链 */
- set_up_list3s(cachep, SIZE_L3);
- /* 后面将会分析kmem_cache_init函数,只有创建struct kmem_list3所在
- cache时才会进入此流程,上面的代码执行完,struct kmem_list3所在
- cache也就创建完毕可以使用了,更新g_cpucache_up */
- g_cpucache_up = PARTIAL_L3;
- } else {
- int node;
- for_each_online_node(node) {
- cachep->nodelists[node] =/* 通过kmalloc分配struct kmem_list3对象 */
- kmalloc_node(sizeof(struct kmem_list3),
- gfp, node);
- BUG_ON(!cachep->nodelists[node]);
- /* 初始化slab三链 */
- kmem_list3_init(cachep->nodelists[node]);
- }
- }
- }
- /* 设置回收时间 */
- cachep->nodelists[numa_node_id()]->next_reap =
- jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
- cpu_cache_get(cachep)->avail = 0;
- cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
- cpu_cache_get(cachep)->batchcount = 1;
- cpu_cache_get(cachep)->touched = 0;
- cachep->batchcount = 1;
- cachep->limit = BOOT_CPUCACHE_ENTRIES;
- return 0;
- }