1. 简介
mmap在用户态通过以下函数进行调用:
- void* mmap( void* addr, size_t size, int prot, int flags, int fd, long offset )
然后进入系统调用。
2. Kernel mmap实现
1)然后进入系统调用,其系统调用号为:
kernel/arch/arm/include/asm/unistd.h
#define __NR_mmap2 (__NR_SYSCALL_BASE+192)
2)触发软中断
其ISR代码位于kernel/arch/arm/kernel/entry-common.S的ENTRY(vector_swi),__NR_mmap2对应的函数为:sys_mmap2(系统调用表位于kernel/arch/arm/kernel/calls.S)
3)sys_mmap2的实现
位于kernel/arch/arm/kernel/entry-common.S,实现代码如下:
- /*
- * Note: off_4k (r5) is always units of 4K. If we can't do the requested
- * offset, we return EINVAL.
- *
- * When PAGE_SHIFT > 12 the offset in r5 is tested against PGOFF_MASK; if
- * no masked bit is set it is shifted right by (PAGE_SHIFT - 12), stored at
- * [sp, #4] and control branches to sys_mmap_pgoff, otherwise -EINVAL is
- * returned in r0. For PAGE_SHIFT <= 12, r5 is stored unchanged and control
- * branches to sys_mmap_pgoff.
- */
- sys_mmap2:
- #if PAGE_SHIFT > 12
- tst r5, #PGOFF_MASK
- moveq r5, r5, lsr #PAGE_SHIFT - 12
- streq r5, [sp, #4]
- beq sys_mmap_pgoff
- mov r0, #-EINVAL
- mov pc, lr
- #else
- str r5, [sp, #4]
- b sys_mmap_pgoff
- #endif
4) 调用sys_mmap_pgoff
在kernel/include/linux/syscalls.h中定义如下:
- asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff); /* prototype only; judging by the sys_mmap2 stub, pgoff is presumably the file offset in page units - TODO confirm */
5)sys_mmap_pgoff实现
在kernel/mm/mmap.c中实现如下:
- /*
- * mmap_pgoff system call (sys_mmap_pgoff).
- *
- * addr, len, prot, flags and pgoff are handed to do_mmap_pgoff(), after
- * MAP_EXECUTABLE and MAP_DENYWRITE have been cleared from flags and, for
- * anonymous MAP_HUGETLB requests, len has been rounded up to the default
- * huge page size. fd is only looked up when MAP_ANONYMOUS is not set.
- *
- * Returns the result of do_mmap_pgoff(); -EBADF when fget(fd) yields no
- * file; -EINVAL when MAP_HUGETLB is requested without MAP_ANONYMOUS; or
- * the error carried by the pointer returned from hugetlb_file_setup().
- */
- SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
- unsigned long, prot, unsigned long, flags,
- unsigned long, fd, unsigned long, pgoff)
- {
- struct file *file = NULL;
- unsigned long retval = -EBADF;
- if (!(flags & MAP_ANONYMOUS)) {
- audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB))
- return -EINVAL;
- file = fget(fd);
- if (!file)
- goto out; /* retval is still -EBADF here */
- } else if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
- /*
- * VM_NORESERVE is used because the reservations will be
- * taken when vm_ops->mmap() is called
- * A dummy user value is used because we are not locking
- * memory so no accounting is necessary
- */
- len = ALIGN(len, huge_page_size(&default_hstate));
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE);
- if (IS_ERR(file))
- return PTR_ERR(file);
- }
- flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- /* do_mmap_pgoff() is called with the caller's mmap_sem held for writing. */
- down_write(&current->mm->mmap_sem);
- retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
- up_write(&current->mm->mmap_sem);
- /* Release the file obtained from fget() or hugetlb_file_setup(). */
- if (file)
- fput(file);
- out:
- return retval;
- }
其功能为:从当前进程中获取用户态可用的虚拟地址空间(vm_area_struct *vma),在mmap_region中真正获取vma,然后调用file->f_op->mmap(file, vma),调用具体的支持mmap的驱动来处理。
下面以binder驱动为例。
3. binder mmap实现
binder驱动的mmap函数为:binder_mmap,其实现代码如下:
- /*
- * binder_mmap() - mmap handler of the binder driver.
- * @filp: binder file; filp->private_data holds the struct binder_proc.
- * @vma:  user-space area being mapped; its length is clamped to SZ_4M.
- *
- * Reserves a kernel virtual range of the same length with get_vm_area(),
- * records the user/kernel address delta in proc->user_buffer_offset,
- * allocates the proc->pages pointer array (one slot per page of the
- * mapping) and populates only the first page through
- * binder_update_page_range().
- *
- * Returns 0 on success, -EPERM if vma->vm_flags contains
- * FORBIDDEN_MMAP_FLAGS, -EBUSY if proc->buffer is already set, or
- * -ENOMEM when one of the allocations fails.
- */
- static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
- {
- int ret;
- struct vm_struct *area;
- struct binder_proc *proc = filp->private_data;
- const char *failure_string;
- struct binder_buffer *buffer;
- /* Never map more than 4M, whatever user space asked for. */
- if ((vma->vm_end - vma->vm_start) > SZ_4M)
- vma->vm_end = vma->vm_start + SZ_4M;
- binder_debug(BINDER_DEBUG_OPEN_CLOSE,
- "binder_mmap: %d %lx-%lx (%ld K) vma %lx pagep %lx\n",
- proc->pid, vma->vm_start, vma->vm_end,
- (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags,
- (unsigned long)pgprot_val(vma->vm_page_prot));
- if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) {
- ret = -EPERM;
- failure_string = "bad vm_flags";
- goto err_bad_arg;
- }
- vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE;
- /* A non-NULL proc->buffer means this proc has already been mapped. */
- if (proc->buffer) {
- ret = -EBUSY;
- failure_string = "already mapped";
- goto err_already_mapped;
- }
- /* Kernel virtual range of the same length as the user vma. */
- area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
- if (area == NULL) {
- ret = -ENOMEM;
- failure_string = "get_vm_area";
- goto err_get_vm_area_failed;
- }
- proc->buffer = area->addr;
- /* user address = kernel address + user_buffer_offset */
- proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer;
- #ifdef CONFIG_CPU_CACHE_VIPT
- if (cache_is_vipt_aliasing()) {
- while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) {
- printk(KERN_INFO "binder_mmap: %d %lx-%lx maps %p bad alignment\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer);
- vma->vm_start += PAGE_SIZE;
- }
- }
- #endif
- /* Zeroed array with one struct page pointer per page of the mapping. */
- proc->pages = kzalloc(sizeof(proc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL);
- if (proc->pages == NULL) {
- ret = -ENOMEM;
- failure_string = "alloc page array";
- goto err_alloc_pages_failed;
- }
- proc->buffer_size = vma->vm_end - vma->vm_start;
- vma->vm_ops = &binder_vm_ops;
- vma->vm_private_data = proc;
- /* Back only the first page now: [buffer, buffer + PAGE_SIZE). */
- if (binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)) {
- ret = -ENOMEM;
- failure_string = "alloc small buf";
- goto err_alloc_small_buf_failed;
- }
- /* The start of the mapped area doubles as the first, free binder_buffer. */
- buffer = proc->buffer;
- INIT_LIST_HEAD(&proc->buffers);
- list_add(&buffer->entry, &proc->buffers);
- buffer->free = 1;
- binder_insert_free_buffer(proc, buffer);
- proc->free_async_space = proc->buffer_size / 2;
- barrier();
- proc->files = get_files_struct(current);
- proc->vma = vma;
- /*printk(KERN_INFO "binder_mmap: %d %lx-%lx maps %p\n",
- proc->pid, vma->vm_start, vma->vm_end, proc->buffer);*/
- return 0;
- /* Error unwinding: each label undoes the steps completed before it. */
- err_alloc_small_buf_failed:
- kfree(proc->pages);
- proc->pages = NULL;
- err_alloc_pages_failed:
- vfree(proc->buffer);
- proc->buffer = NULL;
- err_get_vm_area_failed:
- err_already_mapped:
- err_bad_arg:
- printk(KERN_ERR "binder_mmap: %d %lx-%lx %s failed %d\n",
- proc->pid, vma->vm_start, vma->vm_end, failure_string, ret);
- return ret;
- }
1)获取kernel态虚拟地址空间:
struct vm_struct *area;
area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
根据传过来的vma(数据结构为vm_area_struct,属于进程的一段空间,用于与内核空间映射用的),调用get_vm_area在内核的vmalloc区域获得一个相同大小的连续空间,数据结构为vm_struct,同时将该结构加入到vm_list统一管理
2)保存kernel态虚拟地址空间的起始地址,以便后面使用:
proc->buffer = area->addr;
3) 计算并保存进程用户态虚拟地址空间起始地址与kernel态虚拟地址空间的起始地址的差值, 以便后面使用。
proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer;
4)分配用于保存各物理页描述符指针(struct page *)的数组,每一页对应一项,此时尚未分配物理页本身
proc->pages = kzalloc(sizeof(proc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL);
5)binder_update_page_range
它的工作为:
a)分配物理页
b)分别对vma用户空间建立页表、对vmalloc区域建立页表映射关系。
前面有了用户态和Kernel态的虚拟地址空间,但是还不能访问,因为还没有对应的物理内存。
补充知识:
a)struct page用于跟踪描述一个物理页面是否正在被使用。所有的page结构都被存入一个叫做mem_map的全局数组中。
b)在每个进程的task_struct中包含一个指向mm_struct结构的指针.进程的mm_struct中则包含了进程可执行影像的页目录指针pgd.还包含了指向vm_area_struct的几个指针,每个vm_area_struct包含一个进程的虚拟地址区域.
binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)
proc->buffer指向内核的vmalloc 区域的起始地址,前面已经有了vma(vm_area_struct)和 area(vm_struct)。binder_update_page_range实现代码如下:
- /*
- * binder_update_page_range() - back or release the pages of a kernel range.
- * @proc:     binder process owning the buffer.
- * @allocate: non-zero to allocate and map pages, 0 to unmap and free them.
- * @start:    first kernel virtual address of the range (inclusive).
- * @end:      end of the range (exclusive); nothing is done if end <= start.
- * @vma:      user vma to map into; when NULL, proc->vma is used and the
- *            mmap_sem of the mm obtained from get_task_mm(proc->tsk) is
- *            held for writing for the duration of the call.
- *
- * Allocation: for every page in [start, end) a zeroed page is allocated,
- * mapped at the kernel address with map_vm_area() and inserted into the
- * user vma at (kernel address + proc->user_buffer_offset). If a step
- * fails, control jumps into the release loop below, which undoes the
- * partially set-up page and then every page set up before it.
- *
- * Returns 0 when allocation succeeds or the range is empty. Every other
- * path, including the plain release path (allocate == 0), ends at the
- * final "return -ENOMEM".
- */
- static int binder_update_page_range(struct binder_proc *proc, int allocate,
- void *start, void *end,
- struct vm_area_struct *vma)
- {
- void *page_addr;
- unsigned long user_page_addr;
- struct vm_struct tmp_area;
- struct page **page;
- struct mm_struct *mm;
- binder_debug(BINDER_DEBUG_BUFFER_ALLOC,
- "binder: %d: %s pages %p-%p\n", proc->pid,
- allocate ? "allocate" : "free", start, end);
- if (end <= start)
- return 0;
- /* With a caller-supplied vma no mm reference or locking is taken here. */
- if (vma)
- mm = NULL;
- else
- mm = get_task_mm(proc->tsk);
- if (mm) {
- down_write(&mm->mmap_sem);
- vma = proc->vma;
- }
- if (allocate == 0)
- goto free_range;
- if (vma == NULL) {
- printk(KERN_ERR "binder: %d: binder_alloc_buf failed to "
- "map pages in userspace, no vma\n", proc->pid);
- goto err_no_vma;
- }
- for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) {
- int ret;
- struct page **page_array_ptr;
- /* Slot of this page in proc->pages; it must still be empty. */
- page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
- BUG_ON(*page);
- /* Allocate one zeroed physical page. */
- *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (*page == NULL) {
- printk(KERN_ERR "binder: %d: binder_alloc_buf failed "
- "for page at %p\n", proc->pid, page_addr);
- goto err_alloc_page_failed;
- }
- tmp_area.addr = page_addr;
- tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */;
- page_array_ptr = page;
- /*
- * Map the page at the kernel virtual address: allocate and fill the
- * pud/pmd/pte entries so that the address can be resolved through
- * pgd, pud, pmd and pte to the physical page.
- */
- ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr);
- if (ret) {
- printk(KERN_ERR "binder: %d: binder_alloc_buf failed "
- "to map page at %p in kernel\n",
- proc->pid, page_addr);
- goto err_map_kernel_failed;
- }
- user_page_addr =
- (uintptr_t)page_addr + proc->user_buffer_offset;
- /*
- * Insert the same page into the user vma at the user virtual address,
- * so that user-space accesses to the page starting at user_page_addr
- * reach the same physical memory.
- */
- ret = vm_insert_page(vma, user_page_addr, page[0]);
- if (ret) {
- printk(KERN_ERR "binder: %d: binder_alloc_buf failed "
- "to map page at %lx in userspace\n",
- proc->pid, user_page_addr);
- goto err_vm_insert_page_failed;
- }
- /* vm_insert_page does not seem to increment the refcount */
- }
- if (mm) {
- up_write(&mm->mmap_sem);
- mmput(mm);
- }
- return 0;
- /*
- * Release loop, walking from the last page down to start. The err_*
- * labels inside the body are entry points for the allocation loop above:
- * each one skips the undo steps that the failed page never reached.
- */
- free_range:
- for (page_addr = end - PAGE_SIZE; page_addr >= start;
- page_addr -= PAGE_SIZE) {
- page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
- if (vma)
- zap_page_range(vma, (uintptr_t)page_addr +
- proc->user_buffer_offset, PAGE_SIZE, NULL);
- err_vm_insert_page_failed:
- unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
- err_map_kernel_failed:
- __free_page(*page);
- *page = NULL;
- err_alloc_page_failed:
- ;
- }
- err_no_vma:
- if (mm) {
- up_write(&mm->mmap_sem);
- mmput(mm);
- }
- return -ENOMEM;
- }
a) map_vm_area: 映射Kernel虚拟地址到物理内存,为vmalloc 区域的连续地址空间进行页表映射,当然需要vm_struct (提供虚拟地址)参数和 page参数(用来make pte的),这就完成了内核区的映射
b) vm_insert_page: 更新vma对应的页表,这样就是实现了mmap功能
c)binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)调用的时候只分配了1页物理内存,这是为了节约空间,按需分配。而进程虚拟地址空间和vmalloc内核虚拟地址空间并不占用实际物理内存,所以一开始就按所需大小全部预留,实际的物理页则在以后按需获取;
proc->vma为调用进程的一段用户空间;
proc->files为调用进程的files_struct结构;
proc->buffer_size为需要映射的长度(不超过4M),即vma->vm_end - vma->vm_start;其中实际可用的数据区还要扣除sizeof(struct binder_buffer);
proc->pages为分配的物理页page的指针数组,开始只有一项,即1页,但是长度还是预留好了;
proc->buffer为内核连续映射区首地址 ;
proc->user_buffer_offset 为用户空间映射区首地址-内核空间连续映射的首地址。